In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression , LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score

%matplotlib inline
In [2]:
np.random.seed(0)
In [3]:
os.getcwd()
Out[3]:
'C:\\Users\\Varsha\\Applications of software'
In [4]:
os.chdir('C:\AOS')
In [5]:
os.getcwd()
Out[5]:
'C:\\AOS'
In [9]:
churn=pd.read_csv('churn.csv')
In [11]:
churn.head()
Out[11]:
mobile_number circle_id loc_og_t2o_mou std_og_t2o_mou loc_ic_t2o_mou last_date_of_month_6 last_date_of_month_7 last_date_of_month_8 last_date_of_month_9 arpu_6 ... sachet_3g_9 fb_user_6 fb_user_7 fb_user_8 fb_user_9 aon aug_vbc_3g jul_vbc_3g jun_vbc_3g sep_vbc_3g
0 7000842753 109 0.0 0.0 0.0 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 197.385 ... 0 1.0 1.0 1.0 NaN 968 30.4 0.0 101.20 3.58
1 7001865778 109 0.0 0.0 0.0 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 34.047 ... 0 NaN 1.0 1.0 NaN 1006 0.0 0.0 0.00 0.00
2 7001625959 109 0.0 0.0 0.0 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 167.690 ... 0 NaN NaN NaN 1.0 1103 0.0 0.0 4.17 0.00
3 7001204172 109 0.0 0.0 0.0 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 221.338 ... 0 NaN NaN NaN NaN 2491 0.0 0.0 0.00 0.00
4 7000142493 109 0.0 0.0 0.0 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 261.636 ... 0 0.0 NaN NaN NaN 1526 0.0 0.0 0.00 0.00

5 rows × 226 columns

In [13]:
print(churn.shape)
print(churn.info())
(99999, 226)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 226 entries, mobile_number to sep_vbc_3g
dtypes: float64(179), int64(35), object(12)
memory usage: 172.4+ MB
None
In [15]:
churn.columns.values
Out[15]:
array(['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou',
       'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7',
       'last_date_of_month_8', 'last_date_of_month_9', 'arpu_6', 'arpu_7',
       'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
       'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7', 'offnet_mou_8',
       'offnet_mou_9', 'roam_ic_mou_6', 'roam_ic_mou_7', 'roam_ic_mou_8',
       'roam_ic_mou_9', 'roam_og_mou_6', 'roam_og_mou_7', 'roam_og_mou_8',
       'roam_og_mou_9', 'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7',
       'loc_og_t2t_mou_8', 'loc_og_t2t_mou_9', 'loc_og_t2m_mou_6',
       'loc_og_t2m_mou_7', 'loc_og_t2m_mou_8', 'loc_og_t2m_mou_9',
       'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8',
       'loc_og_t2f_mou_9', 'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7',
       'loc_og_t2c_mou_8', 'loc_og_t2c_mou_9', 'loc_og_mou_6',
       'loc_og_mou_7', 'loc_og_mou_8', 'loc_og_mou_9', 'std_og_t2t_mou_6',
       'std_og_t2t_mou_7', 'std_og_t2t_mou_8', 'std_og_t2t_mou_9',
       'std_og_t2m_mou_6', 'std_og_t2m_mou_7', 'std_og_t2m_mou_8',
       'std_og_t2m_mou_9', 'std_og_t2f_mou_6', 'std_og_t2f_mou_7',
       'std_og_t2f_mou_8', 'std_og_t2f_mou_9', 'std_og_t2c_mou_6',
       'std_og_t2c_mou_7', 'std_og_t2c_mou_8', 'std_og_t2c_mou_9',
       'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8', 'std_og_mou_9',
       'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8', 'isd_og_mou_9',
       'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8', 'spl_og_mou_9',
       'og_others_6', 'og_others_7', 'og_others_8', 'og_others_9',
       'total_og_mou_6', 'total_og_mou_7', 'total_og_mou_8',
       'total_og_mou_9', 'loc_ic_t2t_mou_6', 'loc_ic_t2t_mou_7',
       'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9', 'loc_ic_t2m_mou_6',
       'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8', 'loc_ic_t2m_mou_9',
       'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7', 'loc_ic_t2f_mou_8',
       'loc_ic_t2f_mou_9', 'loc_ic_mou_6', 'loc_ic_mou_7', 'loc_ic_mou_8',
       'loc_ic_mou_9', 'std_ic_t2t_mou_6', 'std_ic_t2t_mou_7',
       'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9', 'std_ic_t2m_mou_6',
       'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8', 'std_ic_t2m_mou_9',
       'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7', 'std_ic_t2f_mou_8',
       'std_ic_t2f_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7',
       'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9', 'std_ic_mou_6',
       'std_ic_mou_7', 'std_ic_mou_8', 'std_ic_mou_9', 'total_ic_mou_6',
       'total_ic_mou_7', 'total_ic_mou_8', 'total_ic_mou_9',
       'spl_ic_mou_6', 'spl_ic_mou_7', 'spl_ic_mou_8', 'spl_ic_mou_9',
       'isd_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8', 'isd_ic_mou_9',
       'ic_others_6', 'ic_others_7', 'ic_others_8', 'ic_others_9',
       'total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8',
       'total_rech_num_9', 'total_rech_amt_6', 'total_rech_amt_7',
       'total_rech_amt_8', 'total_rech_amt_9', 'max_rech_amt_6',
       'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9',
       'date_of_last_rech_6', 'date_of_last_rech_7',
       'date_of_last_rech_8', 'date_of_last_rech_9', 'last_day_rch_amt_6',
       'last_day_rch_amt_7', 'last_day_rch_amt_8', 'last_day_rch_amt_9',
       'date_of_last_rech_data_6', 'date_of_last_rech_data_7',
       'date_of_last_rech_data_8', 'date_of_last_rech_data_9',
       'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8',
       'total_rech_data_9', 'max_rech_data_6', 'max_rech_data_7',
       'max_rech_data_8', 'max_rech_data_9', 'count_rech_2g_6',
       'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_2g_9',
       'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8',
       'count_rech_3g_9', 'av_rech_amt_data_6', 'av_rech_amt_data_7',
       'av_rech_amt_data_8', 'av_rech_amt_data_9', 'vol_2g_mb_6',
       'vol_2g_mb_7', 'vol_2g_mb_8', 'vol_2g_mb_9', 'vol_3g_mb_6',
       'vol_3g_mb_7', 'vol_3g_mb_8', 'vol_3g_mb_9', 'arpu_3g_6',
       'arpu_3g_7', 'arpu_3g_8', 'arpu_3g_9', 'arpu_2g_6', 'arpu_2g_7',
       'arpu_2g_8', 'arpu_2g_9', 'night_pck_user_6', 'night_pck_user_7',
       'night_pck_user_8', 'night_pck_user_9', 'monthly_2g_6',
       'monthly_2g_7', 'monthly_2g_8', 'monthly_2g_9', 'sachet_2g_6',
       'sachet_2g_7', 'sachet_2g_8', 'sachet_2g_9', 'monthly_3g_6',
       'monthly_3g_7', 'monthly_3g_8', 'monthly_3g_9', 'sachet_3g_6',
       'sachet_3g_7', 'sachet_3g_8', 'sachet_3g_9', 'fb_user_6',
       'fb_user_7', 'fb_user_8', 'fb_user_9', 'aon', 'aug_vbc_3g',
       'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'], dtype=object)
In [17]:
#grouping columns based on datatypes
churn.columns.to_series().groupby(churn.dtypes).groups
Out[17]:
{int64: ['mobile_number', 'circle_id', 'total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8', 'total_rech_num_9', 'total_rech_amt_6', 'total_rech_amt_7', 'total_rech_amt_8', 'total_rech_amt_9', 'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9', 'last_day_rch_amt_6', 'last_day_rch_amt_7', 'last_day_rch_amt_8', 'last_day_rch_amt_9', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8', 'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8', 'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8', 'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8', 'sachet_3g_9', 'aon'], float64: ['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6', 'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7', 'offnet_mou_8', 'offnet_mou_9', 'roam_ic_mou_6', 'roam_ic_mou_7', 'roam_ic_mou_8', 'roam_ic_mou_9', 'roam_og_mou_6', 'roam_og_mou_7', 'roam_og_mou_8', 'roam_og_mou_9', 'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8', 'loc_og_t2t_mou_9', 'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7', 'loc_og_t2m_mou_8', 'loc_og_t2m_mou_9', 'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8', 'loc_og_t2f_mou_9', 'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7', 'loc_og_t2c_mou_8', 'loc_og_t2c_mou_9', 'loc_og_mou_6', 'loc_og_mou_7', 'loc_og_mou_8', 'loc_og_mou_9', 'std_og_t2t_mou_6', 'std_og_t2t_mou_7', 'std_og_t2t_mou_8', 'std_og_t2t_mou_9', 'std_og_t2m_mou_6', 'std_og_t2m_mou_7', 'std_og_t2m_mou_8', 'std_og_t2m_mou_9', 'std_og_t2f_mou_6', 'std_og_t2f_mou_7', 'std_og_t2f_mou_8', 'std_og_t2f_mou_9', 'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8', 'std_og_t2c_mou_9', 'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8', 'std_og_mou_9', 'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8', 'isd_og_mou_9', 'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8', 'spl_og_mou_9', 'og_others_6', 'og_others_7', 'og_others_8', 'og_others_9', 'total_og_mou_6', 'total_og_mou_7', 'total_og_mou_8', 'total_og_mou_9', 'loc_ic_t2t_mou_6', 'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9', 'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8', 'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7', 'loc_ic_t2f_mou_8', 'loc_ic_t2f_mou_9', 'loc_ic_mou_6', 'loc_ic_mou_7', 'loc_ic_mou_8', 'loc_ic_mou_9', 'std_ic_t2t_mou_6', 'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9', 'std_ic_t2m_mou_6', ...], object: ['last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8', 'last_date_of_month_9', 'date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8', 'date_of_last_rech_9', 'date_of_last_rech_data_6', 'date_of_last_rech_data_7', 'date_of_last_rech_data_8', 'date_of_last_rech_data_9']}

Splitting into three Data Frames based on datatypes - Date, int and float¶

Date Columns¶

In [19]:
#Converting date columns from object to datetime

date_columns=churn[['last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8',
        'last_date_of_month_9', 'date_of_last_rech_6', 'date_of_last_rech_7',
        'date_of_last_rech_8', 'date_of_last_rech_9',
        'date_of_last_rech_data_6', 'date_of_last_rech_data_7',
        'date_of_last_rech_data_8', 'date_of_last_rech_data_9']]
In [20]:
date_columns.head()
Out[20]:
last_date_of_month_6 last_date_of_month_7 last_date_of_month_8 last_date_of_month_9 date_of_last_rech_6 date_of_last_rech_7 date_of_last_rech_8 date_of_last_rech_9 date_of_last_rech_data_6 date_of_last_rech_data_7 date_of_last_rech_data_8 date_of_last_rech_data_9
0 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 21/06/2014 0:00 16/07/2014 0:00 08/08/2014 0:00 28/09/2014 0:00 21/06/2014 0:00 16/07/2014 0:00 08/08/2014 0:00 NaN
1 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 29/06/2014 0:00 31/07/2014 0:00 28/08/2014 0:00 30/09/2014 0:00 NaN 25/07/2014 0:00 10/08/2014 0:00 NaN
2 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 17/06/2014 0:00 24/07/2014 0:00 14/08/2014 0:00 29/09/2014 0:00 NaN NaN NaN 17/09/2014 0:00
3 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 28/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 NaN NaN NaN NaN
4 30/06/2014 0:00 31/07/2014 0:00 31/08/2014 0:00 30/09/2014 0:00 26/06/2014 0:00 28/07/2014 0:00 09/08/2014 0:00 28/09/2014 0:00 04/06/2014 0:00 NaN NaN NaN
In [23]:
# unique values in date columns
print(date_columns.nunique())
#converting date columns to datetype format
date_columns=date_columns.astype('datetime64[ns]')
last_date_of_month_6         1
last_date_of_month_7         1
last_date_of_month_8         1
last_date_of_month_9         1
date_of_last_rech_6         30
date_of_last_rech_7         31
date_of_last_rech_8         31
date_of_last_rech_9         30
date_of_last_rech_data_6    30
date_of_last_rech_data_7    31
date_of_last_rech_data_8    31
date_of_last_rech_data_9    30
dtype: int64
In [24]:
print(date_columns.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   last_date_of_month_6      99999 non-null  datetime64[ns]
 1   last_date_of_month_7      99398 non-null  datetime64[ns]
 2   last_date_of_month_8      98899 non-null  datetime64[ns]
 3   last_date_of_month_9      98340 non-null  datetime64[ns]
 4   date_of_last_rech_6       98392 non-null  datetime64[ns]
 5   date_of_last_rech_7       98232 non-null  datetime64[ns]
 6   date_of_last_rech_8       96377 non-null  datetime64[ns]
 7   date_of_last_rech_9       95239 non-null  datetime64[ns]
 8   date_of_last_rech_data_6  25153 non-null  datetime64[ns]
 9   date_of_last_rech_data_7  25571 non-null  datetime64[ns]
 10  date_of_last_rech_data_8  26339 non-null  datetime64[ns]
 11  date_of_last_rech_data_9  25922 non-null  datetime64[ns]
dtypes: datetime64[ns](12)
memory usage: 9.2 MB
None

Deriving new features in DataFrame 1¶

In [25]:
#Filtering Day from date of recharge columns
date_columns['day_of_last_rech_6'] =  date_columns.date_of_last_rech_6.dt.day
date_columns['day_of_last_rech_7'] =  date_columns.date_of_last_rech_7.dt.day
date_columns['day_of_last_rech_8'] =  date_columns.date_of_last_rech_8.dt.day

#Imputing missing day fields with 0
date_columns.loc[:,'day_of_last_rech_6']=date_columns['day_of_last_rech_6'].fillna('0')
date_columns.loc[:,'day_of_last_rech_7']=date_columns['day_of_last_rech_7'].fillna('0')
date_columns.loc[:,'day_of_last_rech_8']=date_columns['day_of_last_rech_8'].fillna('0')
In [26]:
#Dropping last date of month columns as it contains only one value and variance is low
#Dropping Date of last recharge columns as day is extracted and is not useful to us anymore
#Dropping date of lsat recharge data fields as the missing values in > 70 %
#Dropping all columns with suffix _9 as they will not be used

date_columns=date_columns.drop(['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8',
                               'date_of_last_rech_9','last_date_of_month_6','last_date_of_month_7',
                               'last_date_of_month_8','last_date_of_month_9','date_of_last_rech_data_6',
                               'date_of_last_rech_data_7','date_of_last_rech_data_8','date_of_last_rech_data_9']
                               ,axis=1)
In [27]:
print(round(100*(date_columns.isnull().sum()/len(date_columns.index)),2))

#Final date_Columns
date_columns.head()
day_of_last_rech_6    0.0
day_of_last_rech_7    0.0
day_of_last_rech_8    0.0
dtype: float64
Out[27]:
day_of_last_rech_6 day_of_last_rech_7 day_of_last_rech_8
0 21.0 16.0 8.0
1 29.0 31.0 28.0
2 17.0 24.0 14.0
3 28.0 31.0 31.0
4 26.0 28.0 9.0

int group Columns¶

In [30]:
int_columns=churn[['mobile_number', 'circle_id', 'total_rech_num_6', 'total_rech_num_7',
        'total_rech_num_8', 'total_rech_num_9', 'total_rech_amt_6',
        'total_rech_amt_7', 'total_rech_amt_8', 'total_rech_amt_9',
        'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9',
        'last_day_rch_amt_6', 'last_day_rch_amt_7', 'last_day_rch_amt_8',
        'last_day_rch_amt_9', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8',
        'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
        'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8',
        'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8',
        'sachet_3g_9', 'aon']]
In [31]:
int_columns.head()
Out[31]:
mobile_number circle_id total_rech_num_6 total_rech_num_7 total_rech_num_8 total_rech_num_9 total_rech_amt_6 total_rech_amt_7 total_rech_amt_8 total_rech_amt_9 ... sachet_2g_9 monthly_3g_6 monthly_3g_7 monthly_3g_8 monthly_3g_9 sachet_3g_6 sachet_3g_7 sachet_3g_8 sachet_3g_9 aon
0 7000842753 109 4 3 2 6 362 252 252 0 ... 0 1 1 1 0 0 0 0 0 968
1 7001865778 109 4 9 11 5 74 384 283 121 ... 0 0 0 0 0 0 0 0 0 1006
2 7001625959 109 5 4 2 7 168 315 116 358 ... 1 0 0 0 0 0 0 0 0 1103
3 7001204172 109 10 11 18 14 230 310 601 410 ... 0 0 0 0 0 0 0 0 0 2491
4 7000142493 109 5 6 3 4 196 350 287 200 ... 0 0 0 0 0 0 0 0 0 1526

5 rows × 35 columns

In [34]:
print(int_columns.nunique()) # Count of unique values in int columns
mobile_number         99999
circle_id                 1
total_rech_num_6        102
total_rech_num_7        101
total_rech_num_8         96
total_rech_num_9         97
total_rech_amt_6       2305
total_rech_amt_7       2329
total_rech_amt_8       2347
total_rech_amt_9       2304
max_rech_amt_6          202
max_rech_amt_7          183
max_rech_amt_8          213
max_rech_amt_9          201
last_day_rch_amt_6      186
last_day_rch_amt_7      173
last_day_rch_amt_8      199
last_day_rch_amt_9      185
monthly_2g_6              5
monthly_2g_7              6
monthly_2g_8              6
monthly_2g_9              5
sachet_2g_6              32
sachet_2g_7              35
sachet_2g_8              34
sachet_2g_9              32
monthly_3g_6             12
monthly_3g_7             15
monthly_3g_8             12
monthly_3g_9             11
sachet_3g_6              25
sachet_3g_7              27
sachet_3g_8              29
sachet_3g_9              27
aon                    3489
dtype: int64
In [35]:
int_columns[['monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8',
        'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
        'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8',
        'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8',
        'sachet_3g_9']].describe()
Out[35]:
monthly_2g_6 monthly_2g_7 monthly_2g_8 monthly_2g_9 sachet_2g_6 sachet_2g_7 sachet_2g_8 sachet_2g_9 monthly_3g_6 monthly_3g_7 monthly_3g_8 monthly_3g_9 sachet_3g_6 sachet_3g_7 sachet_3g_8 sachet_3g_9
count 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000 99999.000000
mean 0.079641 0.083221 0.081001 0.068781 0.389384 0.439634 0.450075 0.393104 0.075921 0.078581 0.082941 0.086341 0.074781 0.080401 0.084501 0.084581
std 0.295058 0.304395 0.299568 0.278120 1.497320 1.636230 1.630263 1.347140 0.363371 0.387231 0.384947 0.384978 0.568344 0.628334 0.660234 0.650457
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 4.000000 5.000000 5.000000 4.000000 42.000000 48.000000 44.000000 40.000000 14.000000 16.000000 16.000000 11.000000 29.000000 35.000000 41.000000 49.000000

Deriving new features in DataFrame 2¶

In [36]:
# Conversion of the number of months by 12
int_columns.loc[:,'aon_years']=int_columns.aon/365
int_columns.loc[:,'aon_years'] = int_columns['aon_years'].astype(int)

int_columns.head()
Out[36]:
mobile_number circle_id total_rech_num_6 total_rech_num_7 total_rech_num_8 total_rech_num_9 total_rech_amt_6 total_rech_amt_7 total_rech_amt_8 total_rech_amt_9 ... monthly_3g_6 monthly_3g_7 monthly_3g_8 monthly_3g_9 sachet_3g_6 sachet_3g_7 sachet_3g_8 sachet_3g_9 aon aon_years
0 7000842753 109 4 3 2 6 362 252 252 0 ... 1 1 1 0 0 0 0 0 968 2.0
1 7001865778 109 4 9 11 5 74 384 283 121 ... 0 0 0 0 0 0 0 0 1006 2.0
2 7001625959 109 5 4 2 7 168 315 116 358 ... 0 0 0 0 0 0 0 0 1103 3.0
3 7001204172 109 10 11 18 14 230 310 601 410 ... 0 0 0 0 0 0 0 0 2491 6.0
4 7000142493 109 5 6 3 4 196 350 287 200 ... 0 0 0 0 0 0 0 0 1526 4.0

5 rows × 36 columns

In [37]:
#Deriving New columns for Good Phase(6 and 7 months)
#Sachet subscription
int_columns.loc[:,'sachet_2g_good_ph'] = (int_columns.sachet_2g_6 + int_columns.sachet_2g_7)/2
int_columns.loc[:,'sachet_3g_good_ph'] = (int_columns.sachet_3g_6 + int_columns.sachet_3g_7)/2

#Monthly 2G/3G
int_columns.loc[:,'monthly_2g_good_ph'] = (int_columns.monthly_2g_6 + int_columns.monthly_2g_7)/2
int_columns.loc[:,'monthly_3g_good_ph'] = (int_columns.monthly_3g_6 + int_columns.monthly_3g_7)/2

#max_recaharge amount
int_columns.loc[:,'max_rech_amt_good_ph'] = (int_columns.max_rech_amt_6 + int_columns.max_rech_amt_7)/2
int_columns.loc[:,'total_rech_num_good_ph'] = (int_columns.total_rech_num_6 + int_columns.total_rech_num_7)/2

# Average total recharge for first 2 months
int_columns.loc[:,'tot_rech_amt_good_ph'] = (int_columns.total_rech_amt_6 + int_columns.total_rech_amt_7)/2
#df['avg_tot_rech_3mths'] = (df['total_rech_amt_6']+df['total_rech_amt_7']+df['total_rech_amt_8'])/3

int_columns.loc[:, 'last_day_rch_amt_good_ph'] = (int_columns.last_day_rch_amt_6 + int_columns.last_day_rch_amt_7)/2
In [38]:
#Dropping circle_id column as it contains only one value and variance is low
#Dropping aon column as new column is derived and is no longer useful 
#Dropping all columns with suffix _6 and _7 as new columns derived 
#Dropping all columns with suffix _9 as they will not be used

int_columns=int_columns.drop(['sachet_2g_6','sachet_2g_7','sachet_3g_6',
                               'sachet_3g_7','monthly_2g_6','monthly_2g_7',
                               'monthly_3g_6','monthly_3g_7','max_rech_amt_6',
                               'max_rech_amt_7','total_rech_num_6','total_rech_num_7','total_rech_amt_6','total_rech_amt_7',
                             'aon','circle_id','last_day_rch_amt_6','last_day_rch_amt_7','total_rech_num_9',
                              'total_rech_amt_9','max_rech_amt_9','last_day_rch_amt_9','monthly_2g_9','sachet_2g_9'
                             ,'sachet_3g_9','monthly_3g_9']
                               ,axis=1)
In [39]:
# final columns 
int_columns.columns.values
Out[39]:
array(['mobile_number', 'total_rech_num_8', 'total_rech_amt_8',
       'max_rech_amt_8', 'last_day_rch_amt_8', 'monthly_2g_8',
       'sachet_2g_8', 'monthly_3g_8', 'sachet_3g_8', 'aon_years',
       'sachet_2g_good_ph', 'sachet_3g_good_ph', 'monthly_2g_good_ph',
       'monthly_3g_good_ph', 'max_rech_amt_good_ph',
       'total_rech_num_good_ph', 'tot_rech_amt_good_ph',
       'last_day_rch_amt_good_ph'], dtype=object)

int group Columns¶

In [41]:
#groupingcolumnsbasedondatatypes
#df.columns[colforcoldf.dtypes=='float64']

list(churn.loc[:,churn.dtypes==float])
float_columns=churn[['loc_og_t2o_mou','std_og_t2o_mou','loc_ic_t2o_mou',
                  'arpu_6','arpu_7','arpu_8','arpu_9',
                  'onnet_mou_6','onnet_mou_7','onnet_mou_8','onnet_mou_9',
                  'offnet_mou_6','offnet_mou_7','offnet_mou_8','offnet_mou_9',
                  'roam_ic_mou_6','roam_ic_mou_7','roam_ic_mou_8','roam_ic_mou_9',
                  'roam_og_mou_6','roam_og_mou_7','roam_og_mou_8','roam_og_mou_9',
                  'loc_og_t2t_mou_6','loc_og_t2t_mou_7','loc_og_t2t_mou_8','loc_og_t2t_mou_9',
                  'loc_og_t2m_mou_6','loc_og_t2m_mou_7','loc_og_t2m_mou_8','loc_og_t2m_mou_9',
                  'loc_og_t2f_mou_6','loc_og_t2f_mou_7','loc_og_t2f_mou_8','loc_og_t2f_mou_9',
                  'loc_og_t2c_mou_6','loc_og_t2c_mou_7','loc_og_t2c_mou_8','loc_og_t2c_mou_9',
                  'loc_og_mou_6','loc_og_mou_7','loc_og_mou_8','loc_og_mou_9',
                  'std_og_t2t_mou_6','std_og_t2t_mou_7','std_og_t2t_mou_8','std_og_t2t_mou_9',
                  'std_og_t2m_mou_6','std_og_t2m_mou_7','std_og_t2m_mou_8','std_og_t2m_mou_9',
                  'std_og_t2f_mou_6','std_og_t2f_mou_7','std_og_t2f_mou_8','std_og_t2f_mou_9',
                  'std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8','std_og_t2c_mou_9',
                  'std_og_mou_6','std_og_mou_7','std_og_mou_8','std_og_mou_9','isd_og_mou_6',
                  'isd_og_mou_7','isd_og_mou_8','isd_og_mou_9',
                  'spl_og_mou_6','spl_og_mou_7','spl_og_mou_8','spl_og_mou_9',
                  'og_others_6','og_others_7','og_others_8','og_others_9',
                  'total_og_mou_6','total_og_mou_7','total_og_mou_8','total_og_mou_9',
                  'loc_ic_t2t_mou_6','loc_ic_t2t_mou_7','loc_ic_t2t_mou_8','loc_ic_t2t_mou_9',
                  'loc_ic_t2m_mou_6','loc_ic_t2m_mou_7','loc_ic_t2m_mou_8','loc_ic_t2m_mou_9',
                  'loc_ic_t2f_mou_6','loc_ic_t2f_mou_7','loc_ic_t2f_mou_8','loc_ic_t2f_mou_9',
                  'loc_ic_mou_6','loc_ic_mou_7','loc_ic_mou_8','loc_ic_mou_9',
                  'std_ic_t2t_mou_6','std_ic_t2t_mou_7','std_ic_t2t_mou_8','std_ic_t2t_mou_9',
                  'std_ic_t2m_mou_6','std_ic_t2m_mou_7','std_ic_t2m_mou_8','std_ic_t2m_mou_9',
                  'std_ic_t2f_mou_6','std_ic_t2f_mou_7','std_ic_t2f_mou_8','std_ic_t2f_mou_9',
                  'std_ic_t2o_mou_6','std_ic_t2o_mou_7','std_ic_t2o_mou_8','std_ic_t2o_mou_9',
                  'std_ic_mou_6','std_ic_mou_7','std_ic_mou_8','std_ic_mou_9',
                  'total_ic_mou_6','total_ic_mou_7','total_ic_mou_8','total_ic_mou_9',
                  'spl_ic_mou_6','spl_ic_mou_7','spl_ic_mou_8','spl_ic_mou_9',
                  'isd_ic_mou_6','isd_ic_mou_7','isd_ic_mou_8','isd_ic_mou_9',
                  'ic_others_6','ic_others_7','ic_others_8','ic_others_9',
                  'total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9',
                  'max_rech_data_6','max_rech_data_7','max_rech_data_8','max_rech_data_9',
                  'count_rech_2g_6','count_rech_2g_7','count_rech_2g_8','count_rech_2g_9',
                  'count_rech_3g_6','count_rech_3g_7','count_rech_3g_8','count_rech_3g_9',
                  'av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9'
                  ,'vol_2g_mb_6','vol_2g_mb_7','vol_2g_mb_8','vol_2g_mb_9',
                  'vol_3g_mb_6','vol_3g_mb_7','vol_3g_mb_8','vol_3g_mb_9','arpu_3g_6','arpu_3g_7','arpu_3g_8','arpu_3g_9','arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','night_pck_user_6','night_pck_user_7','night_pck_user_8','night_pck_user_9','fb_user_6','fb_user_7','fb_user_8','fb_user_9','aug_vbc_3g','jul_vbc_3g','jun_vbc_3g','sep_vbc_3g']]
                 
In [42]:
#Checking for null values

float_null= round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
float_null[float_null > 10]
Out[42]:
total_rech_data_6     74.85
total_rech_data_7     74.43
total_rech_data_8     73.66
total_rech_data_9     74.08
max_rech_data_6       74.85
max_rech_data_7       74.43
max_rech_data_8       73.66
max_rech_data_9       74.08
count_rech_2g_6       74.85
count_rech_2g_7       74.43
count_rech_2g_8       73.66
count_rech_2g_9       74.08
count_rech_3g_6       74.85
count_rech_3g_7       74.43
count_rech_3g_8       73.66
count_rech_3g_9       74.08
av_rech_amt_data_6    74.85
av_rech_amt_data_7    74.43
av_rech_amt_data_8    73.66
av_rech_amt_data_9    74.08
arpu_3g_6             74.85
arpu_3g_7             74.43
arpu_3g_8             73.66
arpu_3g_9             74.08
arpu_2g_6             74.85
arpu_2g_7             74.43
arpu_2g_8             73.66
arpu_2g_9             74.08
night_pck_user_6      74.85
night_pck_user_7      74.43
night_pck_user_8      73.66
night_pck_user_9      74.08
fb_user_6             74.85
fb_user_7             74.43
fb_user_8             73.66
fb_user_9             74.08
dtype: float64
In [43]:
# Dropping the above columns with more than 70% missing values

float_columns=float_columns.drop(['total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9',
                              'max_rech_data_6','max_rech_data_7','max_rech_data_8','max_rech_data_9','count_rech_2g_6',
                              'count_rech_2g_7','count_rech_2g_8','count_rech_2g_9','count_rech_3g_6','count_rech_3g_7',
                              'count_rech_3g_8','count_rech_3g_9','av_rech_amt_data_6','av_rech_amt_data_7',
                              'av_rech_amt_data_8','av_rech_amt_data_9','arpu_3g_6','arpu_3g_7','arpu_3g_8','arpu_3g_9',
                              'arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','night_pck_user_6','night_pck_user_7',
                              'night_pck_user_8','night_pck_user_9','fb_user_6','fb_user_7','fb_user_8','fb_user_9'],axis=1)
In [44]:
#Remaining Columns for analysis 
float_columns.columns.values
Out[44]:
array(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
       'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7',
       'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7',
       'offnet_mou_8', 'offnet_mou_9', 'roam_ic_mou_6', 'roam_ic_mou_7',
       'roam_ic_mou_8', 'roam_ic_mou_9', 'roam_og_mou_6', 'roam_og_mou_7',
       'roam_og_mou_8', 'roam_og_mou_9', 'loc_og_t2t_mou_6',
       'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8', 'loc_og_t2t_mou_9',
       'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7', 'loc_og_t2m_mou_8',
       'loc_og_t2m_mou_9', 'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7',
       'loc_og_t2f_mou_8', 'loc_og_t2f_mou_9', 'loc_og_t2c_mou_6',
       'loc_og_t2c_mou_7', 'loc_og_t2c_mou_8', 'loc_og_t2c_mou_9',
       'loc_og_mou_6', 'loc_og_mou_7', 'loc_og_mou_8', 'loc_og_mou_9',
       'std_og_t2t_mou_6', 'std_og_t2t_mou_7', 'std_og_t2t_mou_8',
       'std_og_t2t_mou_9', 'std_og_t2m_mou_6', 'std_og_t2m_mou_7',
       'std_og_t2m_mou_8', 'std_og_t2m_mou_9', 'std_og_t2f_mou_6',
       'std_og_t2f_mou_7', 'std_og_t2f_mou_8', 'std_og_t2f_mou_9',
       'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
       'std_og_t2c_mou_9', 'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8',
       'std_og_mou_9', 'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8',
       'isd_og_mou_9', 'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8',
       'spl_og_mou_9', 'og_others_6', 'og_others_7', 'og_others_8',
       'og_others_9', 'total_og_mou_6', 'total_og_mou_7',
       'total_og_mou_8', 'total_og_mou_9', 'loc_ic_t2t_mou_6',
       'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9',
       'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8',
       'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7',
       'loc_ic_t2f_mou_8', 'loc_ic_t2f_mou_9', 'loc_ic_mou_6',
       'loc_ic_mou_7', 'loc_ic_mou_8', 'loc_ic_mou_9', 'std_ic_t2t_mou_6',
       'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9',
       'std_ic_t2m_mou_6', 'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8',
       'std_ic_t2m_mou_9', 'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7',
       'std_ic_t2f_mou_8', 'std_ic_t2f_mou_9', 'std_ic_t2o_mou_6',
       'std_ic_t2o_mou_7', 'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9',
       'std_ic_mou_6', 'std_ic_mou_7', 'std_ic_mou_8', 'std_ic_mou_9',
       'total_ic_mou_6', 'total_ic_mou_7', 'total_ic_mou_8',
       'total_ic_mou_9', 'spl_ic_mou_6', 'spl_ic_mou_7', 'spl_ic_mou_8',
       'spl_ic_mou_9', 'isd_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8',
       'isd_ic_mou_9', 'ic_others_6', 'ic_others_7', 'ic_others_8',
       'ic_others_9', 'vol_2g_mb_6', 'vol_2g_mb_7', 'vol_2g_mb_8',
       'vol_2g_mb_9', 'vol_3g_mb_6', 'vol_3g_mb_7', 'vol_3g_mb_8',
       'vol_3g_mb_9', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g',
       'sep_vbc_3g'], dtype=object)
In [45]:
# Checking for zero variance columns
float_columns.std()[float_columns.std()==0].index.values
Out[45]:
array(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
       'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
       'std_og_t2c_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7',
       'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9'], dtype=object)
In [46]:
#Dropping these variable
float_columns=float_columns.drop(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
       'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
       'std_og_t2c_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7',
       'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9'],axis=1)
In [47]:
#Remaining Columns for analysis
float_columns.columns.values
Out[47]:
array(['arpu_6', 'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6',
       'onnet_mou_7', 'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6',
       'offnet_mou_7', 'offnet_mou_8', 'offnet_mou_9', 'roam_ic_mou_6',
       'roam_ic_mou_7', 'roam_ic_mou_8', 'roam_ic_mou_9', 'roam_og_mou_6',
       'roam_og_mou_7', 'roam_og_mou_8', 'roam_og_mou_9',
       'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8',
       'loc_og_t2t_mou_9', 'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7',
       'loc_og_t2m_mou_8', 'loc_og_t2m_mou_9', 'loc_og_t2f_mou_6',
       'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8', 'loc_og_t2f_mou_9',
       'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7', 'loc_og_t2c_mou_8',
       'loc_og_t2c_mou_9', 'loc_og_mou_6', 'loc_og_mou_7', 'loc_og_mou_8',
       'loc_og_mou_9', 'std_og_t2t_mou_6', 'std_og_t2t_mou_7',
       'std_og_t2t_mou_8', 'std_og_t2t_mou_9', 'std_og_t2m_mou_6',
       'std_og_t2m_mou_7', 'std_og_t2m_mou_8', 'std_og_t2m_mou_9',
       'std_og_t2f_mou_6', 'std_og_t2f_mou_7', 'std_og_t2f_mou_8',
       'std_og_t2f_mou_9', 'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8',
       'std_og_mou_9', 'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8',
       'isd_og_mou_9', 'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8',
       'spl_og_mou_9', 'og_others_6', 'og_others_7', 'og_others_8',
       'og_others_9', 'total_og_mou_6', 'total_og_mou_7',
       'total_og_mou_8', 'total_og_mou_9', 'loc_ic_t2t_mou_6',
       'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9',
       'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8',
       'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7',
       'loc_ic_t2f_mou_8', 'loc_ic_t2f_mou_9', 'loc_ic_mou_6',
       'loc_ic_mou_7', 'loc_ic_mou_8', 'loc_ic_mou_9', 'std_ic_t2t_mou_6',
       'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9',
       'std_ic_t2m_mou_6', 'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8',
       'std_ic_t2m_mou_9', 'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7',
       'std_ic_t2f_mou_8', 'std_ic_t2f_mou_9', 'std_ic_mou_6',
       'std_ic_mou_7', 'std_ic_mou_8', 'std_ic_mou_9', 'total_ic_mou_6',
       'total_ic_mou_7', 'total_ic_mou_8', 'total_ic_mou_9',
       'spl_ic_mou_6', 'spl_ic_mou_7', 'spl_ic_mou_8', 'spl_ic_mou_9',
       'isd_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8', 'isd_ic_mou_9',
       'ic_others_6', 'ic_others_7', 'ic_others_8', 'ic_others_9',
       'vol_2g_mb_6', 'vol_2g_mb_7', 'vol_2g_mb_8', 'vol_2g_mb_9',
       'vol_3g_mb_6', 'vol_3g_mb_7', 'vol_3g_mb_8', 'vol_3g_mb_9',
       'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'],
      dtype=object)
In [48]:
#Checking for missing values 
missing_float=round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
missing_float[missing_float > 0.1]
Out[48]:
onnet_mou_6     3.94
onnet_mou_7     3.86
onnet_mou_8     5.38
onnet_mou_9     7.75
offnet_mou_6    3.94
                ... 
isd_ic_mou_9    7.75
ic_others_6     3.94
ic_others_7     3.86
ic_others_8     5.38
ic_others_9     7.75
Length: 108, dtype: float64
In [49]:
#Imputing missing values 
#Since the minutes of usage columns are all numerical, imputing missing values with zero

mou_cols = float_columns.columns[float_columns.columns.str.contains('mou')]
float_columns.loc[:,mou_cols] = float_columns.loc[:,mou_cols].replace(np.NaN,0)

missing_float=round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
missing_float[missing_float > 0.1]
Out[49]:
og_others_6    3.94
og_others_7    3.86
og_others_8    5.38
og_others_9    7.75
ic_others_6    3.94
ic_others_7    3.86
ic_others_8    5.38
ic_others_9    7.75
dtype: float64
In [50]:
#Imputing missing values 

others_cols = float_columns.columns[float_columns.columns.str.contains('others')]
float_columns.loc[:,others_cols] = float_columns.loc[:,others_cols].replace(np.NaN,0)

missing_float=round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
missing_float[missing_float > 0]
Out[50]:
Series([], dtype: float64)
In [51]:
#Checking Incoming and Outgoing columns

ic_cols = float_columns.columns[float_columns.columns.str.contains('ic_mou_6')]
print(ic_cols)

float_columns.loc[:,ic_cols].head()

ic_all=float_columns.roam_ic_mou_6+ float_columns.loc_ic_mou_6+float_columns.std_ic_mou_6+ float_columns.spl_ic_mou_6+float_columns.isd_ic_mou_6
print(ic_all.head())
print(float_columns.total_ic_mou_6.head())

#Since the total incoming column is equal to the sum of other
Index(['roam_ic_mou_6', 'loc_ic_mou_6', 'std_ic_mou_6', 'total_ic_mou_6',
       'spl_ic_mou_6', 'isd_ic_mou_6'],
      dtype='object')
0      0.00
1     26.82
2    369.10
3    280.06
4    215.96
dtype: float64
0      0.00
1     26.83
2    370.04
3    280.08
4    216.44
Name: total_ic_mou_6, dtype: float64

Deriving new features in DataFrame 3¶

In [52]:
## Deriving new features from existing columns 

# Average of first 2 months 
float_columns.loc[:,'arpu_good_ph'] = (float_columns['arpu_6']+float_columns['arpu_7'])/2
float_columns.loc[:,'spl_ic_mou_good_ph'] = (float_columns.spl_ic_mou_6 + float_columns.spl_ic_mou_7)/2
float_columns.loc[:,'isd_ic_mou_good_ph'] = (float_columns.isd_ic_mou_6 + float_columns.isd_ic_mou_7)/2
float_columns.loc[:,'roam_ic_mou_good_ph'] = (float_columns.roam_ic_mou_6 + float_columns.roam_ic_mou_7)/2
float_columns.loc[:,'std_ic_t2m_mou_good_ph'] = (float_columns.std_ic_t2m_mou_6 + float_columns.std_ic_t2m_mou_7)/2
float_columns.loc[:,'std_ic_t2t_mou_good_ph'] = (float_columns.std_ic_t2t_mou_6 + float_columns.std_ic_t2t_mou_7)/2
float_columns.loc[:,'std_ic_t2f_mou_good_ph'] = (float_columns.std_ic_t2f_mou_6 + float_columns.std_ic_t2f_mou_7)/2
float_columns.loc[:,'loc_ic_t2m_mou_good_ph'] = (float_columns.loc_ic_t2m_mou_6 + float_columns.loc_ic_t2m_mou_7)/2
float_columns.loc[:,'loc_ic_t2t_mou_good_ph'] = (float_columns.loc_ic_t2t_mou_6 + float_columns.loc_ic_t2t_mou_7)/2
float_columns.loc[:,'loc_ic_t2f_mou_good_ph'] = (float_columns.loc_ic_t2f_mou_6 + float_columns.loc_ic_t2f_mou_7)/2
float_columns.loc[:,'offnet_mou_good_ph'] = (float_columns.offnet_mou_6 + float_columns.offnet_mou_7)/2
float_columns.loc[:,'onnet_mou_good_ph'] = (float_columns.onnet_mou_6 + float_columns.onnet_mou_7)/2
float_columns.loc[:,'og_others_good_ph'] = (float_columns.og_others_6 + float_columns.og_others_7)/2
float_columns.loc[:,'ic_others_good_ph'] = (float_columns.ic_others_6 + float_columns.ic_others_7)/2
float_columns.loc[:,'spl_og_mou_good_ph'] = (float_columns.spl_og_mou_6 + float_columns.spl_og_mou_7)/2
float_columns.loc[:,'isd_og_mou_good_ph'] = (float_columns.isd_og_mou_6 + float_columns.isd_og_mou_7)/2
float_columns.loc[:,'roam_og_mou_good_ph'] = (float_columns.roam_og_mou_6 + float_columns.roam_og_mou_7)/2
float_columns.loc[:,'std_og_t2m_mou_good_ph'] = (float_columns.std_og_t2m_mou_6 + float_columns.std_og_t2m_mou_7)/2
float_columns.loc[:,'std_og_t2t_mou_good_ph'] = (float_columns.std_og_t2t_mou_6 + float_columns.std_og_t2t_mou_7)/2
float_columns.loc[:,'std_og_t2f_mou_good_ph'] = (float_columns.std_og_t2f_mou_6 + float_columns.std_og_t2f_mou_7)/2
float_columns.loc[:,'loc_og_t2m_mou_good_ph'] = (float_columns.loc_og_t2m_mou_6 + float_columns.loc_og_t2m_mou_7)/2
float_columns.loc[:,'loc_og_t2t_mou_good_ph'] = (float_columns.loc_og_t2t_mou_6 + float_columns.loc_og_t2t_mou_7)/2
float_columns.loc[:,'loc_og_t2f_mou_good_ph'] = (float_columns.loc_og_t2f_mou_6 + float_columns.loc_og_t2f_mou_7)/2
float_columns.loc[:,'loc_og_t2c_mou_good_ph'] = (float_columns.loc_og_t2c_mou_6 + float_columns.loc_og_t2c_mou_7)/2
float_columns.loc[:,'total_ic_mou_good_ph'] = (float_columns.total_ic_mou_6 + float_columns.total_ic_mou_7)/2
float_columns.loc[:,'total_og_mou_good_ph'] = (float_columns.total_og_mou_6 + float_columns.total_og_mou_7)/2
float_columns.loc[:,'vbc_3g_good_ph'] = (float_columns.jun_vbc_3g + float_columns.jul_vbc_3g)/2
float_columns.loc[:,'std_ic_mou_good_ph'] = (float_columns.std_ic_mou_6 + float_columns.std_ic_mou_7)/2
float_columns.loc[:,'loc_ic_mou_good_ph'] = (float_columns.loc_ic_mou_6 + float_columns.loc_ic_mou_7)/2
float_columns.loc[:,'std_og_mou_good_ph'] = (float_columns.std_og_mou_6 + float_columns.std_og_mou_7)/2
float_columns.loc[:,'loc_og_mou_good_ph'] = (float_columns.loc_og_mou_6 + float_columns.loc_og_mou_7)/2
float_columns.loc[:,'vol_2g_mb_good_ph'] = (float_columns.vol_2g_mb_6 + float_columns.vol_2g_mb_7)/2
float_columns.loc[:,'vol_3g_mb_good_ph'] = (float_columns.vol_3g_mb_6 + float_columns.vol_3g_mb_7)/2

c)Tag churners and remove attributes of the churn phase: Now tag the churned customers (churn=1, else 0) based on the fourth month as follows: Those who have not made any calls (either incoming or outgoing) AND have not used mobile internet even once in the churn phase. The attributes you need to use to tag churners are:¶

total_ic_mou_9¶

total_og_mou_9¶

vol_2g_mb_9¶

vol_3g_mb_9¶

After tagging churners, remove all the attributes corresponding to the churn phase (all attributes having ‘ _9’, etc. in their names).¶

Deriving Churner tags¶

In [53]:
#Grouping  attributes needed to use to tag churners 
usage_cols=['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']

def churn_calc(row):
    return 1 if (row['total_ic_mou_9']==0) & (row['total_og_mou_9']==0) & (row['vol_2g_mb_9']==0) & (row['vol_3g_mb_9']==0) else 0

#Deriving Churner tags
float_columns['churn']=float_columns[usage_cols].apply(churn_calc,axis=1)
In [54]:
float_columns['churn'].value_counts()
Out[54]:
churn
0    89808
1    10191
Name: count, dtype: int64
In [56]:
#We are calculating threshold based on the normal distribution to detect the drop in telecom usage.
In [57]:
def phase_calc(row):
    
    #total_ic_mou_6
    ic_std=np.std([row['total_ic_mou_6'],row['total_ic_mou_7']])
    ic_mean=np.mean([row['total_ic_mou_6'],row['total_ic_mou_7']])
    ic_threshold=ic_mean-ic_std
    
    #total_og_mou_6
    og_std=np.std([row['total_og_mou_6'],row['total_og_mou_7']])
    og_mean=np.mean([row['total_og_mou_6'],row['total_og_mou_7']])
    og_threshold=og_mean-og_std
    
    #vol_2g_mb_6
    two_g_std=np.std([row['vol_2g_mb_6'],row['vol_2g_mb_7']])
    two_g_mean=np.mean([row['vol_2g_mb_6'],row['vol_2g_mb_7']])
    two_g_threshold=two_g_mean-two_g_std
    
    #vol_3g_mb_6
    three_g_std=np.std([row['vol_3g_mb_6'],row['vol_3g_mb_7']])
    three_g_mean=np.mean([row['vol_3g_mb_6'],row['vol_3g_mb_7']])
    three_g_threshold=three_g_mean-three_g_std
    
    if (row['total_ic_mou_8'] < ic_threshold) | (row['total_og_mou_8'] < og_threshold)\
        | (row['vol_2g_mb_8'] < two_g_threshold) | (row['vol_3g_mb_8'] < three_g_threshold):
        return 1
    else:
        return 0

float_columns['action_required']=float_columns.apply(phase_calc,axis=1)
In [59]:
float_columns['action_required'].value_counts()
Out[59]:
action_required
1    51814
0    48185
Name: count, dtype: int64
In [60]:
# Dropping the _9 suffix variables
# Dropping the _6 and _7 suffix variables because we have derived the columns from these variables

float_columns=float_columns.drop(['spl_ic_mou_6','isd_ic_mou_6','roam_ic_mou_6','std_ic_t2m_mou_6','std_ic_t2t_mou_6',
                                  'std_ic_t2f_mou_6','loc_ic_t2m_mou_6','loc_ic_t2t_mou_6','loc_ic_t2f_mou_6',
                                  'offnet_mou_6','onnet_mou_6','og_others_6','ic_others_7','spl_og_mou_6','isd_og_mou_6',
                                  'roam_og_mou_6','std_og_t2m_mou_6','std_og_t2t_mou_6','std_og_t2f_mou_6',
                                  'loc_og_t2m_mou_6','loc_og_t2t_mou_6','loc_og_t2f_mou_6','loc_og_t2c_mou_6',
                                  'total_ic_mou_6','total_og_mou_6','spl_ic_mou_7','isd_ic_mou_7','roam_ic_mou_7',
                                  'std_ic_t2m_mou_7','std_ic_t2t_mou_7','std_ic_t2f_mou_7','loc_ic_t2m_mou_7',
                                  'loc_ic_t2t_mou_7','loc_ic_t2f_mou_7','offnet_mou_7','onnet_mou_7','og_others_7',
                                  'ic_others_6','spl_og_mou_7','isd_og_mou_7','roam_og_mou_7','std_og_t2m_mou_7',
                                  'std_og_t2t_mou_7','std_og_t2f_mou_7','loc_og_t2m_mou_7','loc_og_t2t_mou_7',
                                  'loc_og_t2f_mou_7','loc_og_t2c_mou_7','total_ic_mou_7','total_og_mou_7','arpu_9', 
                                  'onnet_mou_9', 'offnet_mou_9', 'roam_ic_mou_9','roam_og_mou_9', 'loc_og_t2t_mou_9',
                                  'loc_og_t2m_mou_9','loc_og_t2f_mou_9', 'loc_og_t2c_mou_9', 'loc_og_mou_9',
                                  'std_og_t2t_mou_9', 'std_og_t2m_mou_9', 'std_og_t2f_mou_9','std_og_mou_9', 'isd_og_mou_9',
                                  'spl_og_mou_9', 'og_others_9','total_og_mou_9', 'loc_ic_t2t_mou_9', 'loc_ic_t2m_mou_9',
                                  'loc_ic_t2f_mou_9', 'loc_ic_mou_9', 'std_ic_t2t_mou_9','std_ic_t2m_mou_9', 
                                  'std_ic_t2f_mou_9', 'std_ic_mou_9','vol_2g_mb_6', 'vol_2g_mb_7','vol_3g_mb_6', 'vol_3g_mb_7',
                                  'total_ic_mou_9', 'spl_ic_mou_9', 'isd_ic_mou_9', 'ic_others_9','vol_2g_mb_9', 
                                  'vol_3g_mb_9','arpu_6', 'arpu_7','std_ic_mou_6','std_ic_mou_7',
                                  'loc_ic_mou_6','loc_ic_mou_7','std_og_mou_6','std_og_mou_7','loc_og_mou_6',
                                  'loc_og_mou_7','jul_vbc_3g', 'jun_vbc_3g','sep_vbc_3g'],axis=1)
In [61]:
#Final columns in dataframe 3 : float_columns
float_columns.columns.values
Out[61]:
array(['arpu_8', 'onnet_mou_8', 'offnet_mou_8', 'roam_ic_mou_8',
       'roam_og_mou_8', 'loc_og_t2t_mou_8', 'loc_og_t2m_mou_8',
       'loc_og_t2f_mou_8', 'loc_og_t2c_mou_8', 'loc_og_mou_8',
       'std_og_t2t_mou_8', 'std_og_t2m_mou_8', 'std_og_t2f_mou_8',
       'std_og_mou_8', 'isd_og_mou_8', 'spl_og_mou_8', 'og_others_8',
       'total_og_mou_8', 'loc_ic_t2t_mou_8', 'loc_ic_t2m_mou_8',
       'loc_ic_t2f_mou_8', 'loc_ic_mou_8', 'std_ic_t2t_mou_8',
       'std_ic_t2m_mou_8', 'std_ic_t2f_mou_8', 'std_ic_mou_8',
       'total_ic_mou_8', 'spl_ic_mou_8', 'isd_ic_mou_8', 'ic_others_8',
       'vol_2g_mb_8', 'vol_3g_mb_8', 'aug_vbc_3g', 'arpu_good_ph',
       'spl_ic_mou_good_ph', 'isd_ic_mou_good_ph', 'roam_ic_mou_good_ph',
       'std_ic_t2m_mou_good_ph', 'std_ic_t2t_mou_good_ph',
       'std_ic_t2f_mou_good_ph', 'loc_ic_t2m_mou_good_ph',
       'loc_ic_t2t_mou_good_ph', 'loc_ic_t2f_mou_good_ph',
       'offnet_mou_good_ph', 'onnet_mou_good_ph', 'og_others_good_ph',
       'ic_others_good_ph', 'spl_og_mou_good_ph', 'isd_og_mou_good_ph',
       'roam_og_mou_good_ph', 'std_og_t2m_mou_good_ph',
       'std_og_t2t_mou_good_ph', 'std_og_t2f_mou_good_ph',
       'loc_og_t2m_mou_good_ph', 'loc_og_t2t_mou_good_ph',
       'loc_og_t2f_mou_good_ph', 'loc_og_t2c_mou_good_ph',
       'total_ic_mou_good_ph', 'total_og_mou_good_ph', 'vbc_3g_good_ph',
       'std_ic_mou_good_ph', 'loc_ic_mou_good_ph', 'std_og_mou_good_ph',
       'loc_og_mou_good_ph', 'vol_2g_mb_good_ph', 'vol_3g_mb_good_ph',
       'churn', 'action_required'], dtype=object)

Concatinating the three data frames to create Master data frame¶

In [62]:
master_frame = pd.concat([int_columns,date_columns,float_columns], axis=1)
In [63]:
master_frame.columns.values
Out[63]:
array(['mobile_number', 'total_rech_num_8', 'total_rech_amt_8',
       'max_rech_amt_8', 'last_day_rch_amt_8', 'monthly_2g_8',
       'sachet_2g_8', 'monthly_3g_8', 'sachet_3g_8', 'aon_years',
       'sachet_2g_good_ph', 'sachet_3g_good_ph', 'monthly_2g_good_ph',
       'monthly_3g_good_ph', 'max_rech_amt_good_ph',
       'total_rech_num_good_ph', 'tot_rech_amt_good_ph',
       'last_day_rch_amt_good_ph', 'day_of_last_rech_6',
       'day_of_last_rech_7', 'day_of_last_rech_8', 'arpu_8',
       'onnet_mou_8', 'offnet_mou_8', 'roam_ic_mou_8', 'roam_og_mou_8',
       'loc_og_t2t_mou_8', 'loc_og_t2m_mou_8', 'loc_og_t2f_mou_8',
       'loc_og_t2c_mou_8', 'loc_og_mou_8', 'std_og_t2t_mou_8',
       'std_og_t2m_mou_8', 'std_og_t2f_mou_8', 'std_og_mou_8',
       'isd_og_mou_8', 'spl_og_mou_8', 'og_others_8', 'total_og_mou_8',
       'loc_ic_t2t_mou_8', 'loc_ic_t2m_mou_8', 'loc_ic_t2f_mou_8',
       'loc_ic_mou_8', 'std_ic_t2t_mou_8', 'std_ic_t2m_mou_8',
       'std_ic_t2f_mou_8', 'std_ic_mou_8', 'total_ic_mou_8',
       'spl_ic_mou_8', 'isd_ic_mou_8', 'ic_others_8', 'vol_2g_mb_8',
       'vol_3g_mb_8', 'aug_vbc_3g', 'arpu_good_ph', 'spl_ic_mou_good_ph',
       'isd_ic_mou_good_ph', 'roam_ic_mou_good_ph',
       'std_ic_t2m_mou_good_ph', 'std_ic_t2t_mou_good_ph',
       'std_ic_t2f_mou_good_ph', 'loc_ic_t2m_mou_good_ph',
       'loc_ic_t2t_mou_good_ph', 'loc_ic_t2f_mou_good_ph',
       'offnet_mou_good_ph', 'onnet_mou_good_ph', 'og_others_good_ph',
       'ic_others_good_ph', 'spl_og_mou_good_ph', 'isd_og_mou_good_ph',
       'roam_og_mou_good_ph', 'std_og_t2m_mou_good_ph',
       'std_og_t2t_mou_good_ph', 'std_og_t2f_mou_good_ph',
       'loc_og_t2m_mou_good_ph', 'loc_og_t2t_mou_good_ph',
       'loc_og_t2f_mou_good_ph', 'loc_og_t2c_mou_good_ph',
       'total_ic_mou_good_ph', 'total_og_mou_good_ph', 'vbc_3g_good_ph',
       'std_ic_mou_good_ph', 'loc_ic_mou_good_ph', 'std_og_mou_good_ph',
       'loc_og_mou_good_ph', 'vol_2g_mb_good_ph', 'vol_3g_mb_good_ph',
       'churn', 'action_required'], dtype=object)
In [64]:
print(master_frame.shape)
print(master_frame.info())
(99999, 89)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 89 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   mobile_number             99999 non-null  int64  
 1   total_rech_num_8          99999 non-null  int64  
 2   total_rech_amt_8          99999 non-null  int64  
 3   max_rech_amt_8            99999 non-null  int64  
 4   last_day_rch_amt_8        99999 non-null  int64  
 5   monthly_2g_8              99999 non-null  int64  
 6   sachet_2g_8               99999 non-null  int64  
 7   monthly_3g_8              99999 non-null  int64  
 8   sachet_3g_8               99999 non-null  int64  
 9   aon_years                 99999 non-null  float64
 10  sachet_2g_good_ph         99999 non-null  float64
 11  sachet_3g_good_ph         99999 non-null  float64
 12  monthly_2g_good_ph        99999 non-null  float64
 13  monthly_3g_good_ph        99999 non-null  float64
 14  max_rech_amt_good_ph      99999 non-null  float64
 15  total_rech_num_good_ph    99999 non-null  float64
 16  tot_rech_amt_good_ph      99999 non-null  float64
 17  last_day_rch_amt_good_ph  99999 non-null  float64
 18  day_of_last_rech_6        99999 non-null  object 
 19  day_of_last_rech_7        99999 non-null  object 
 20  day_of_last_rech_8        99999 non-null  object 
 21  arpu_8                    99999 non-null  float64
 22  onnet_mou_8               99999 non-null  float64
 23  offnet_mou_8              99999 non-null  float64
 24  roam_ic_mou_8             99999 non-null  float64
 25  roam_og_mou_8             99999 non-null  float64
 26  loc_og_t2t_mou_8          99999 non-null  float64
 27  loc_og_t2m_mou_8          99999 non-null  float64
 28  loc_og_t2f_mou_8          99999 non-null  float64
 29  loc_og_t2c_mou_8          99999 non-null  float64
 30  loc_og_mou_8              99999 non-null  float64
 31  std_og_t2t_mou_8          99999 non-null  float64
 32  std_og_t2m_mou_8          99999 non-null  float64
 33  std_og_t2f_mou_8          99999 non-null  float64
 34  std_og_mou_8              99999 non-null  float64
 35  isd_og_mou_8              99999 non-null  float64
 36  spl_og_mou_8              99999 non-null  float64
 37  og_others_8               99999 non-null  float64
 38  total_og_mou_8            99999 non-null  float64
 39  loc_ic_t2t_mou_8          99999 non-null  float64
 40  loc_ic_t2m_mou_8          99999 non-null  float64
 41  loc_ic_t2f_mou_8          99999 non-null  float64
 42  loc_ic_mou_8              99999 non-null  float64
 43  std_ic_t2t_mou_8          99999 non-null  float64
 44  std_ic_t2m_mou_8          99999 non-null  float64
 45  std_ic_t2f_mou_8          99999 non-null  float64
 46  std_ic_mou_8              99999 non-null  float64
 47  total_ic_mou_8            99999 non-null  float64
 48  spl_ic_mou_8              99999 non-null  float64
 49  isd_ic_mou_8              99999 non-null  float64
 50  ic_others_8               99999 non-null  float64
 51  vol_2g_mb_8               99999 non-null  float64
 52  vol_3g_mb_8               99999 non-null  float64
 53  aug_vbc_3g                99999 non-null  float64
 54  arpu_good_ph              99999 non-null  float64
 55  spl_ic_mou_good_ph        99999 non-null  float64
 56  isd_ic_mou_good_ph        99999 non-null  float64
 57  roam_ic_mou_good_ph       99999 non-null  float64
 58  std_ic_t2m_mou_good_ph    99999 non-null  float64
 59  std_ic_t2t_mou_good_ph    99999 non-null  float64
 60  std_ic_t2f_mou_good_ph    99999 non-null  float64
 61  loc_ic_t2m_mou_good_ph    99999 non-null  float64
 62  loc_ic_t2t_mou_good_ph    99999 non-null  float64
 63  loc_ic_t2f_mou_good_ph    99999 non-null  float64
 64  offnet_mou_good_ph        99999 non-null  float64
 65  onnet_mou_good_ph         99999 non-null  float64
 66  og_others_good_ph         99999 non-null  float64
 67  ic_others_good_ph         99999 non-null  float64
 68  spl_og_mou_good_ph        99999 non-null  float64
 69  isd_og_mou_good_ph        99999 non-null  float64
 70  roam_og_mou_good_ph       99999 non-null  float64
 71  std_og_t2m_mou_good_ph    99999 non-null  float64
 72  std_og_t2t_mou_good_ph    99999 non-null  float64
 73  std_og_t2f_mou_good_ph    99999 non-null  float64
 74  loc_og_t2m_mou_good_ph    99999 non-null  float64
 75  loc_og_t2t_mou_good_ph    99999 non-null  float64
 76  loc_og_t2f_mou_good_ph    99999 non-null  float64
 77  loc_og_t2c_mou_good_ph    99999 non-null  float64
 78  total_ic_mou_good_ph      99999 non-null  float64
 79  total_og_mou_good_ph      99999 non-null  float64
 80  vbc_3g_good_ph            99999 non-null  float64
 81  std_ic_mou_good_ph        99999 non-null  float64
 82  loc_ic_mou_good_ph        99999 non-null  float64
 83  std_og_mou_good_ph        99999 non-null  float64
 84  loc_og_mou_good_ph        99999 non-null  float64
 85  vol_2g_mb_good_ph         99999 non-null  float64
 86  vol_3g_mb_good_ph         99999 non-null  float64
 87  churn                     99999 non-null  int64  
 88  action_required           99999 non-null  int64  
dtypes: float64(75), int64(11), object(3)
memory usage: 67.9+ MB
None

Deriving High Value Customers(HVC)¶

In [65]:
threshold_value=master_frame['tot_rech_amt_good_ph'].quantile(.70)
master_frame['high_value_customer']=master_frame['tot_rech_amt_good_ph'].apply(lambda x: True if x==True else ( True if x > threshold_value else False))
In [66]:
master_frame['high_value_customer'].value_counts()
Out[66]:
high_value_customer
False    70020
True     29979
Name: count, dtype: int64
In [67]:
master_frame['churn'].value_counts()
Out[67]:
churn
0    89808
1    10191
Name: count, dtype: int64
In [69]:
## Filtering HVC data from master_frame
master_frame_HVC=master_frame[master_frame['high_value_customer']==True]
In [70]:
master_frame_HVC.shape
Out[70]:
(29979, 90)
In [71]:
master_frame_HVC=master_frame_HVC.set_index('mobile_number',drop=True)
In [72]:
master_frame_HVC.head()
Out[72]:
total_rech_num_8 total_rech_amt_8 max_rech_amt_8 last_day_rch_amt_8 monthly_2g_8 sachet_2g_8 monthly_3g_8 sachet_3g_8 aon_years sachet_2g_good_ph ... vbc_3g_good_ph std_ic_mou_good_ph loc_ic_mou_good_ph std_og_mou_good_ph loc_og_mou_good_ph vol_2g_mb_good_ph vol_3g_mb_good_ph churn action_required high_value_customer
mobile_number
7000701601 7 3638 1580 779 0 0 0 0 2.0 0.0 ... 19.060 176.810 391.555 68.000 478.035 0.0 0.000 1 1 True
7001524846 14 120 30 10 0 3 0 0 0.0 0.5 ... 516.405 13.545 52.630 136.870 333.135 178.0 375.475 0 1 True
7002191713 11 717 130 0 0 3 0 0 7.0 0.0 ... 0.000 245.950 120.845 420.355 129.395 0.0 0.000 0 0 True
7000875565 2 160 130 130 0 0 0 0 1.0 0.0 ... 12.170 119.860 253.345 48.990 265.775 0.0 0.000 0 1 True
7000187447 4 30 30 0 0 0 0 0 1.0 0.0 ... 0.000 4.955 42.135 574.635 48.770 0.0 0.000 0 1 True

5 rows × 89 columns

In [74]:
#Churn counts for HVC customers
master_frame_HVC['churn'].value_counts()
Out[74]:
churn
0    27390
1     2589
Name: count, dtype: int64

EDA and univariate analysis¶

In [75]:
churn_percentage=(master_frame_HVC['churn'].value_counts()*100.0 /len(master_frame_HVC)),2
print(churn_percentage)

#churn_rate_graph = (master_frame_HVC['churn'].value_counts()*100.0 /len(master_frame_HVC)).plot(kind='bar',stacked = True,rot = 0,figsize = (8,6))                                                                                                                                                 
#churn_rate_graph.set_ylabel('% Customers',size = 14)
#churn_rate_graph.set_xlabel('Non_Churn Vs churn',size = 14)
#churn_rate_graph.set_title('churn Rate', size = 14)

sns.barplot(x="churn", y="churn", data=master_frame_HVC, estimator=lambda x: len(x) / len(master_frame_HVC) * 100)
plt.title("Non_Churn rate Vs churn rate")
(churn
0    91.363955
1     8.636045
Name: count, dtype: float64, 2)
Out[75]:
Text(0.5, 1.0, 'Non_Churn rate Vs churn rate')
In [76]:
#Churn comparisions on other columns
#sns.boxplot(x=df.churn, y="aon_years", data=df)
#plt.title("Churn stats based on age on network")
aon_churn_hvc_counts=master_frame_HVC.pivot_table(index=['aon_years'],values=['high_value_customer','churn'], aggfunc='sum')
print(aon_churn_hvc_counts)

#plt.subplot(1,1, 1)
usa=sns.barplot(x=aon_churn_hvc_counts.index, y="churn", data=aon_churn_hvc_counts)
plt.title("Churn count based on age on network")
           churn  high_value_customer
aon_years                            
0.0          672                 4853
1.0          818                 7616
2.0          504                 4661
3.0          197                 3044
4.0          110                 1980
5.0           51                 1251
6.0           78                 2069
7.0           73                 1826
8.0           53                 1523
9.0           19                  572
10.0          14                  556
11.0           0                   28
Out[76]:
Text(0.5, 1.0, 'Churn count based on age on network')
In [77]:
#Churn stats based on monthly ARPU
plt.figure(1)
plt.subplot(1,1, 1)
sns.boxplot(x=master_frame_HVC.churn, y="arpu_good_ph", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on arpu in good phase")

plt.figure(2)
plt.subplot(1, 2, 2)
sns.boxplot(x=master_frame_HVC.churn, y="arpu_8", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on arpu in August")
Out[77]:
Text(0.5, 1.0, 'Churn stats based on arpu in August')
In [78]:
#Churn stats based on monthly Total recharge
plt.figure(1)
plt.subplot(1,1, 1)
sns.boxplot(x=master_frame_HVC.churn, y="tot_rech_amt_good_ph", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on total recharge in good phase")

plt.figure(2)
plt.subplot(1, 2, 2)
sns.boxplot(x=master_frame_HVC.churn, y="total_rech_amt_8", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on total recharge in August")
Out[78]:
Text(0.5, 1.0, 'Churn stats based on total recharge in August')
In [79]:
#Churn stats based on day of last recharge

last_day_rech_hvc_count_6=master_frame_HVC.pivot_table(index=['day_of_last_rech_6'],values=['high_value_customer','churn'], aggfunc='sum')
last_day_rech_hvc_count_7=master_frame_HVC.pivot_table(index=['day_of_last_rech_7'],values=['high_value_customer','churn'], aggfunc='sum')
last_day_rech_hvc_count_8=master_frame_HVC.pivot_table(index=['day_of_last_rech_8'],values=['high_value_customer','churn'], aggfunc='sum')

plt.figure(1)
plt.rcParams['figure.figsize']=(5,5)
plt.subplot(1,1, 1)
sns.lineplot(x=last_day_rech_hvc_count_6.index, y="churn", data=last_day_rech_hvc_count_6, )
plt.title("Churn stats based on day of last recharge in June")

plt.figure(2)
plt.rcParams['figure.figsize']=(5,5)
plt.subplot(1,1, 1)
sns.lineplot(x=last_day_rech_hvc_count_7.index, y="churn", data=last_day_rech_hvc_count_7)
plt.title("Churn stats based on day of last recharge in July")

plt.figure(3)
plt.rcParams['figure.figsize']=(5,5)
#plt.subplot(1,1, 1)
sns.lineplot(x=last_day_rech_hvc_count_8.index, y="churn", data=last_day_rech_hvc_count_8)
plt.title("Churn stats based on day of last recharge in August")
Out[79]:
Text(0.5, 1.0, 'Churn stats based on day of last recharge in August')

Conclusion¶

1.Churn rate is the highest for customers who are less thean 3 years with the operator.¶

2.The max revenue for churn customer is below the average revenue for non churn customer.¶

3.The max total recharge for churn customer is below the total recharge for non churn customer.¶

4.Since a typical recharge is valid for 1 month, we see the initial graph on a steady trend of recharge towards the end of the month. But, the trend changes for the next two month we observe intermediary recharges and increase in churn activity. Suggestion is to incorporate better benefits in the monthly recharges for stable customer satisfaction¶

Bivariate Analysis¶

In [83]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [84]:
# Let's see the correlation matrix 
plt.figure(figsize = (30,30))        # Size of the figure
sns.heatmap(master_frame_HVC.corr(),annot = True)
plt.show()

Conclusion¶

We see very few columns with high correlation in between. Further analysis is continued using PCA.¶

In [86]:
# To handle class imbalance.
logregcv = LogisticRegressionCV(class_weight='balanced',cv=20)

Test-Train Split¶

In [87]:
from sklearn.model_selection import train_test_split

X=master_frame_HVC.drop(['churn'],axis=1)

y=master_frame_HVC['churn']

y.head()
Out[87]:
mobile_number
7000701601    1
7001524846    0
7002191713    0
7000875565    0
7000187447    0
Name: churn, dtype: int64
In [88]:
X_train , X_test , y_train , y_test = train_test_split(X,y,train_size=0.7, random_state=42,stratify=y)
In [90]:
X_train.head()
Out[90]:
total_rech_num_8 total_rech_amt_8 max_rech_amt_8 last_day_rch_amt_8 monthly_2g_8 sachet_2g_8 monthly_3g_8 sachet_3g_8 aon_years sachet_2g_good_ph ... total_og_mou_good_ph vbc_3g_good_ph std_ic_mou_good_ph loc_ic_mou_good_ph std_og_mou_good_ph loc_og_mou_good_ph vol_2g_mb_good_ph vol_3g_mb_good_ph action_required high_value_customer
mobile_number
7000669104 6 546 130 0 0 0 0 0 4.0 0.0 ... 1848.215 0.000 45.270 77.005 1721.110 106.90 0.865 512.33 1 True
7000402026 8 824 154 150 1 0 0 0 2.0 0.0 ... 301.160 0.000 67.860 47.880 236.930 55.67 330.245 0.71 1 True
7001391178 21 764 130 23 0 0 0 0 3.0 0.0 ... 90.930 76.075 16.630 135.855 4.155 80.23 0.000 0.00 0 True
7002018307 6 650 130 130 0 0 0 0 6.0 0.0 ... 434.210 0.000 23.410 130.000 106.900 322.47 0.000 0.00 1 True
7001748476 11 605 169 0 0 0 0 0 1.0 0.0 ... 727.720 0.000 1.455 26.880 694.745 32.51 0.000 0.00 0 True

5 rows × 88 columns

In [91]:
X_train.shape
Out[91]:
(20985, 88)
In [92]:
X_test.shape
Out[92]:
(8994, 88)

Feature scaling in train and test Data¶

In [93]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)
In [94]:
X_train = pd.DataFrame(X_train, columns=X.columns.tolist())
X_test = pd.DataFrame(X_test, columns=X.columns.tolist())
In [95]:
X_train.head()
Out[95]:
total_rech_num_8 total_rech_amt_8 max_rech_amt_8 last_day_rch_amt_8 monthly_2g_8 sachet_2g_8 monthly_3g_8 sachet_3g_8 aon_years sachet_2g_good_ph ... total_og_mou_good_ph vbc_3g_good_ph std_ic_mou_good_ph loc_ic_mou_good_ph std_og_mou_good_ph loc_og_mou_good_ph vol_2g_mb_good_ph vol_3g_mb_good_ph action_required high_value_customer
0 -0.446222 -0.120083 -0.187419 -0.645018 -0.318315 -0.327589 -0.29811 -0.142157 0.382464 -0.322029 ... 1.898281 -0.349485 -0.050728 -0.563974 2.299885 -0.490779 -0.302375 0.328327 0.784442 0.0
1 -0.234334 0.410158 -0.050759 0.368976 2.470764 -0.327589 -0.29811 -0.142157 -0.354272 -0.322029 ... -0.650806 -0.349485 0.119928 -0.661131 -0.294499 -0.634671 0.963922 -0.352741 0.784442 0.0
2 1.142938 0.295717 -0.187419 -0.489539 -0.318315 -0.327589 -0.29811 -0.142157 0.014096 -0.322029 ... -0.997202 -0.145330 -0.267088 -0.367658 -0.701396 -0.565689 -0.305701 -0.353686 -1.274792 0.0
3 -0.446222 0.078281 -0.187419 0.233777 -0.318315 -0.327589 -0.29811 -0.142157 1.119200 -0.322029 ... -0.431579 -0.349485 -0.215869 -0.387189 -0.521795 0.114703 -0.305701 -0.353686 0.784442 0.0
4 0.083498 -0.007550 0.034653 -0.645018 -0.318315 -0.327589 -0.29811 -0.142157 -0.722640 -0.322029 ... 0.052038 -0.349485 -0.381728 -0.731184 0.505773 -0.699722 -0.305701 -0.353686 -1.274792 0.0

5 rows × 88 columns

In [99]:
X_test.head()
Out[99]:
total_rech_num_8 total_rech_amt_8 max_rech_amt_8 last_day_rch_amt_8 monthly_2g_8 sachet_2g_8 monthly_3g_8 sachet_3g_8 aon_years sachet_2g_good_ph ... total_og_mou_good_ph vbc_3g_good_ph std_ic_mou_good_ph loc_ic_mou_good_ph std_og_mou_good_ph loc_og_mou_good_ph vol_2g_mb_good_ph vol_3g_mb_good_ph action_required high_value_customer
0 -0.444856 -0.530691 -0.392343 -0.692552 -0.321494 1.283644 -0.295119 -0.148593 -0.714719 -0.027169 ... 0.032974 -0.353104 0.198055 -0.551233 0.022477 0.047114 0.582397 -0.309060 0.777585 0.0
1 -1.071519 -0.830938 -0.984200 -0.692552 -0.321494 -0.333467 -0.295119 -0.148593 1.122429 -0.335316 ... -0.506944 -0.353104 -0.357072 0.162981 -0.589694 0.108572 -0.317606 -0.309060 -1.286033 0.0
2 1.539577 0.336837 -0.579563 -0.253797 -0.321494 -0.333467 -0.295119 -0.148593 -0.714719 -0.335316 ... 0.759307 -0.353104 -0.140783 0.287297 0.987603 -0.292667 -0.317606 -0.309060 -1.286033 0.0
3 -0.549300 -0.033822 -0.078297 0.386354 2.499496 -0.333467 -0.295119 -0.148593 -0.714719 -0.335316 ... -0.490456 1.683022 -0.322943 2.799367 -0.694598 0.305518 -0.002491 0.623742 0.777585 0.0
4 -0.653744 0.165457 0.525639 1.105625 -0.321494 -0.333467 -0.295119 -0.148593 1.857289 -0.335316 ... 0.252899 -0.353104 0.838203 0.951118 -0.514479 1.271098 -0.317606 -0.309060 0.777585 0.0

5 rows × 88 columns

Model Building using PCA¶

Model Building : Model 1¶

Check if PCA n components with different number improve the model¶

90%¶

In [101]:
pca = PCA(0.90)
In [102]:
X_train_pca=pca.fit_transform(X_train)
In [103]:
X_train_pca.shape
Out[103]:
(20985, 40)
In [104]:
pca.components_
Out[104]:
array([[ 0.07633853,  0.22566864,  0.13663787, ...,  0.03945671,
        -0.03219828,  0.        ],
       [ 0.15513212,  0.09444172, -0.06123446, ..., -0.08169126,
        -0.04890042, -0.        ],
       [-0.05271368,  0.17236309,  0.21582728, ...,  0.27568083,
         0.00317266, -0.        ],
       ...,
       [ 0.20732827, -0.07185651,  0.00651994, ...,  0.09591715,
        -0.14249913,  0.        ],
       [-0.08092157,  0.04718852, -0.02644475, ...,  0.1039233 ,
         0.0244161 ,  0.        ],
       [-0.0472879 , -0.0264921 , -0.04764544, ...,  0.07186826,
        -0.32915986,  0.        ]])
In [105]:
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head(30)
Out[105]:
PC1 PC2 Feature
0 0.076339 0.155132 total_rech_num_8
1 0.225669 0.094442 total_rech_amt_8
2 0.136638 -0.061234 max_rech_amt_8
3 0.109444 -0.053867 last_day_rch_amt_8
4 0.014793 -0.062349 monthly_2g_8
5 -0.002771 0.011097 sachet_2g_8
6 0.056406 -0.070434 monthly_3g_8
7 0.006635 0.001019 sachet_3g_8
8 0.049381 -0.105510 aon_years
9 -0.013258 0.003086 sachet_2g_good_ph
10 0.000050 -0.005416 sachet_3g_good_ph
11 0.005280 -0.074147 monthly_2g_good_ph
12 0.042446 -0.077806 monthly_3g_good_ph
13 0.107142 -0.091951 max_rech_amt_good_ph
14 0.042225 0.142088 total_rech_num_good_ph
15 0.203252 0.088775 tot_rech_amt_good_ph
16 0.092732 -0.065286 last_day_rch_amt_good_ph
17 0.018674 0.083224 day_of_last_rech_6
18 0.045441 0.083241 day_of_last_rech_7
19 0.056599 0.089709 day_of_last_rech_8
20 0.227566 0.094874 arpu_8
21 0.114218 0.193629 onnet_mou_8
22 0.176213 0.145429 offnet_mou_8
23 0.014691 0.009111 roam_ic_mou_8
24 0.020832 0.034103 roam_og_mou_8
25 0.140692 -0.015786 loc_og_t2t_mou_8
26 0.200935 -0.078544 loc_og_t2m_mou_8
27 0.086528 -0.082607 loc_og_t2f_mou_8
28 0.018994 0.041399 loc_og_t2c_mou_8
29 0.223691 -0.064883 loc_og_mou_8
In [106]:
corrmat = np.corrcoef(X_train_pca.transpose())
In [107]:
corrmat.shape
Out[107]:
(40, 40)
In [108]:
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 1.4080458222073042e-15 , min corr:  -1.3545759810548892e-15
In [109]:
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
Out[109]:
(8994, 40)
In [110]:
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
In [111]:
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
Out[111]:
'0.89'
In [112]:
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
Out[112]:
'0.89'
In [113]:
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
In [114]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
In [115]:
confusion_matrix(y_train,y_train_pred_bool)
Out[115]:
array([[15923,  3250],
       [  313,  1499]], dtype=int64)
In [116]:
confusion_matrix(y_test,y_pred_bool)
Out[116]:
array([[6856, 1361],
       [ 139,  638]], dtype=int64)
In [117]:
precision_score(y_test,y_pred_bool)
Out[117]:
0.31915957978989495
In [118]:
recall_score(y_test,y_pred_bool)
Out[118]:
0.8211068211068211
In [119]:
accuracy_score(y_test,y_pred_bool)
Out[119]:
0.8332221480987325
In [120]:
#Function to calculate Model Metrics

def get_model_metrics(actual,predicted):
    """
    Print Summary Metrics of the Model

    Parameters:
    actual (pandas.core.series.Series): Series of Boolean values for target column
    predicted (pandas.core.series.Series): Series of Boolean values for Model predicted the target column
    
    Returns:
    None

   """

    
    confusion = metrics.confusion_matrix(actual,predicted )
    TP = confusion[1,1] # true positive 
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    accuracy=metrics.accuracy_score(actual,predicted)
    sensitivity = TP / float(TP+FN)
    specificity = TN / float(TN+FP)
    precision=TP/float(FP+TP)
    recall=TP/float(FN+TP)
    
    precision=metrics.precision_score(actual,predicted)
    recall=metrics.recall_score(actual,predicted)
    f1=metrics.f1_score(actual,predicted, average='weighted') 
    print("Accuracy    : "+str(round(accuracy,2)))
    print("Sensitivity : "+str(round(sensitivity,2)))
    print("Specificity : "+str(round(specificity,2)))
    print("Precision   : "+str(round(precision,2)))
    print("Recall      : "+str(round(recall,2)))
    print("F1_score    : "+str(round(f1,2)))

def get_cross_validated_model_metrics(X,y,cv=5):
    """
    Get cross validated model metric for k folds

    Parameters:
    X (pandas.core.frame.DataFrame): DF of all the features excluding target column
    y (pandas.core.series.Series): Series of Boolean values of the target column
    
    Returns:
    df (pandas.core.frame.DataFrame): DF will all the metric for k fold

   """
    accuracy=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='accuracy')
    precision=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='precision')
    recall=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='recall')
    f1_weighted=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='f1_weighted')
    roc_auc=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='roc_auc')
    df=pd.DataFrame(
    {'accuracy': accuracy,
     'precision': precision,
     'recall': recall,
     'f1_weighted': f1_weighted,
     'roc_auc': roc_auc,
    })
    return df

Model 1 metric Summary¶

In [121]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.83
Sensitivity : 0.82
Specificity : 0.83
Precision   : 0.32
Recall      : 0.82
F1_score    : 0.86

Model 2¶

Check if PCA n components with different number improve the model¶

85%¶

In [122]:
pca = PCA(0.85)
In [127]:
X_train_pca=pca.fit_transform(X_train)
In [124]:
X_train_pca.shape
Out[124]:
(20985, 33)
In [126]:
pca.components_
Out[126]:
array([[ 0.07633853,  0.22566864,  0.13663787, ...,  0.03945671,
        -0.03219828,  0.        ],
       [ 0.15513212,  0.09444172, -0.06123446, ..., -0.08169126,
        -0.04890042, -0.        ],
       [-0.05271368,  0.17236309,  0.21582728, ...,  0.27568083,
         0.00317266, -0.        ],
       ...,
       [ 0.08794035,  0.08126948, -0.063038  , ..., -0.01767814,
         0.24643063,  0.        ],
       [-0.0551191 , -0.14849084, -0.10057557, ...,  0.11180516,
         0.09907325, -0.        ],
       [-0.3403509 , -0.03490944,  0.04207936, ..., -0.01395101,
        -0.02831194,  0.        ]])
In [128]:
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head()
Out[128]:
PC1 PC2 Feature
0 0.076339 0.155132 total_rech_num_8
1 0.225669 0.094442 total_rech_amt_8
2 0.136638 -0.061234 max_rech_amt_8
3 0.109444 -0.053867 last_day_rch_amt_8
4 0.014793 -0.062349 monthly_2g_8
In [129]:
corrmat = np.corrcoef(X_train_pca.transpose())
In [130]:
corrmat.shape
Out[130]:
(33, 33)
In [131]:
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 1.0844956478739332e-15 , min corr:  -1.3545759810548892e-15
In [132]:
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
Out[132]:
(8994, 33)
In [133]:
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
In [134]:
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
Out[134]:
'0.89'
In [135]:
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
Out[135]:
'0.89'
In [136]:
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
In [137]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
In [138]:
confusion_matrix(y_test,y_pred_bool)
Out[138]:
array([[6805, 1412],
       [ 142,  635]], dtype=int64)
In [139]:
confusion_matrix(y_train,y_train_pred_bool)
Out[139]:
array([[15828,  3345],
       [  325,  1487]], dtype=int64)
In [140]:
import  sklearn.metrics
In [141]:
precision_score(y_test,y_pred_bool)
Out[141]:
0.3102100635075721
In [142]:
recall_score(y_test,y_pred_bool)
Out[142]:
0.8172458172458172
In [143]:
accuracy_score(y_test,y_pred_bool)
Out[143]:
0.8272181454302868

Model 2 Metric Summary¶

In [144]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.83
Sensitivity : 0.82
Specificity : 0.83
Precision   : 0.31
Recall      : 0.82
F1_score    : 0.86

Model 3¶

Check if PCA n components with different number improve the model¶

80%¶

In [146]:
pca = PCA(0.80)
In [147]:
X_train_pca=pca.fit_transform(X_train)
In [148]:
X_train_pca.shape
Out[148]:
(20985, 28)
In [149]:
pca.components_
Out[149]:
array([[ 0.07633853,  0.22566864,  0.13663787, ...,  0.03945671,
        -0.03219828,  0.        ],
       [ 0.15513212,  0.09444172, -0.06123446, ..., -0.08169126,
        -0.04890042, -0.        ],
       [-0.05271368,  0.17236309,  0.21582728, ...,  0.27568083,
         0.00317266, -0.        ],
       ...,
       [-0.04948918,  0.04932912, -0.04596317, ..., -0.04633158,
        -0.04737415, -0.        ],
       [-0.08773346, -0.01372893, -0.01463961, ..., -0.00117623,
         0.20125531, -0.        ],
       [-0.13545784, -0.04061064, -0.12696834, ...,  0.01968636,
         0.06213658,  0.        ]])
In [150]:
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head()
Out[150]:
PC1 PC2 Feature
0 0.076339 0.155132 total_rech_num_8
1 0.225669 0.094442 total_rech_amt_8
2 0.136638 -0.061234 max_rech_amt_8
3 0.109444 -0.053867 last_day_rch_amt_8
4 0.014793 -0.062349 monthly_2g_8
In [151]:
corrmat = np.corrcoef(X_train_pca.transpose())
In [152]:
corrmat.shape
Out[152]:
(28, 28)
In [153]:
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 1.0844956478739332e-15 , min corr:  -1.5184103551529068e-15
In [154]:
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
Out[154]:
(8994, 28)
In [155]:
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
In [156]:
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
Out[156]:
'0.88'
In [157]:
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
Out[157]:
'0.88'
In [158]:
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
In [159]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
In [160]:
confusion_matrix(y_test,y_pred_bool)
Out[160]:
array([[6843, 1374],
       [ 144,  633]], dtype=int64)
In [161]:
confusion_matrix(y_train,y_train_pred_bool)
Out[161]:
array([[15872,  3301],
       [  347,  1465]], dtype=int64)
In [162]:
import  sklearn.metrics
In [163]:
precision_score(y_test,y_pred_bool)
Out[163]:
0.3153961136023916
In [164]:
recall_score(y_test,y_pred_bool)
Out[164]:
0.8146718146718147
In [165]:
accuracy_score(y_test,y_pred_bool)
Out[165]:
0.8312208138759173

Model 3 Metric Summary¶

In [167]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.83
Sensitivity : 0.81
Specificity : 0.83
Precision   : 0.32
Recall      : 0.81
F1_score    : 0.86

N components with 80% 85% and 90% doesnt have much impact. We will go with PCA components as 90%¶

In [168]:
X_train.shape
Out[168]:
(20985, 88)
In [169]:
#Improting the PCA module
from sklearn.decomposition import PCA
pca = PCA(svd_solver='randomized', random_state=42)
In [170]:
X_train_pca=pca.fit_transform(X_train)
In [171]:
X_train_pca.shape
Out[171]:
(20985, 88)
In [172]:
pca.components_
Out[172]:
array([[ 7.63385296e-02,  2.25668637e-01,  1.36637874e-01, ...,
         3.94567053e-02, -3.21982809e-02,  0.00000000e+00],
       [ 1.55132118e-01,  9.44417182e-02, -6.12344615e-02, ...,
        -8.16912579e-02, -4.89004224e-02,  0.00000000e+00],
       [-5.27136831e-02,  1.72363091e-01,  2.15827278e-01, ...,
         2.75680826e-01,  3.17265679e-03, -0.00000000e+00],
       ...,
       [-5.38011643e-08,  1.69099296e-07, -1.18763814e-07, ...,
         1.10214967e-08,  1.16673514e-07,  0.00000000e+00],
       [-7.49304793e-08,  1.20074454e-07, -3.78340539e-08, ...,
        -4.04387345e-08,  4.89967732e-10,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00]])
In [173]:
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head(30)
Out[173]:
PC1 PC2 Feature
0 0.076339 0.155132 total_rech_num_8
1 0.225669 0.094442 total_rech_amt_8
2 0.136638 -0.061234 max_rech_amt_8
3 0.109444 -0.053867 last_day_rch_amt_8
4 0.014793 -0.062349 monthly_2g_8
5 -0.002771 0.011097 sachet_2g_8
6 0.056406 -0.070434 monthly_3g_8
7 0.006635 0.001019 sachet_3g_8
8 0.049381 -0.105510 aon_years
9 -0.013258 0.003086 sachet_2g_good_ph
10 0.000050 -0.005416 sachet_3g_good_ph
11 0.005280 -0.074147 monthly_2g_good_ph
12 0.042446 -0.077806 monthly_3g_good_ph
13 0.107142 -0.091951 max_rech_amt_good_ph
14 0.042225 0.142088 total_rech_num_good_ph
15 0.203252 0.088775 tot_rech_amt_good_ph
16 0.092732 -0.065286 last_day_rch_amt_good_ph
17 0.018674 0.083224 day_of_last_rech_6
18 0.045441 0.083241 day_of_last_rech_7
19 0.056599 0.089709 day_of_last_rech_8
20 0.227566 0.094874 arpu_8
21 0.114218 0.193629 onnet_mou_8
22 0.176213 0.145429 offnet_mou_8
23 0.014691 0.009111 roam_ic_mou_8
24 0.020832 0.034103 roam_og_mou_8
25 0.140692 -0.015786 loc_og_t2t_mou_8
26 0.200935 -0.078544 loc_og_t2m_mou_8
27 0.086528 -0.082607 loc_og_t2f_mou_8
28 0.018994 0.041399 loc_og_t2c_mou_8
29 0.223691 -0.064883 loc_og_mou_8
In [174]:
pca.explained_variance_ratio_
Out[174]:
array([1.05523190e-01, 9.35620165e-02, 6.07000296e-02, 4.87794462e-02,
       4.36956587e-02, 3.84396347e-02, 3.10280137e-02, 2.91858432e-02,
       2.81325159e-02, 2.74104374e-02, 2.63263170e-02, 2.52636769e-02,
       2.42871750e-02, 2.30059535e-02, 2.13248453e-02, 2.02745458e-02,
       1.88465408e-02, 1.67142391e-02, 1.47504540e-02, 1.43684902e-02,
       1.39366438e-02, 1.34973248e-02, 1.30978332e-02, 1.24556228e-02,
       1.20125318e-02, 1.07138470e-02, 1.00918448e-02, 9.86099508e-03,
       9.73066836e-03, 9.44571953e-03, 9.28097814e-03, 8.58852311e-03,
       8.52420458e-03, 8.08950927e-03, 7.99544103e-03, 7.68923889e-03,
       7.36964213e-03, 7.23625935e-03, 6.87277474e-03, 6.81889637e-03,
       6.64771908e-03, 6.18745755e-03, 5.77083420e-03, 5.35067544e-03,
       5.10346473e-03, 4.96276039e-03, 4.55836387e-03, 3.74719120e-03,
       3.65611275e-03, 3.49448213e-03, 3.39702242e-03, 3.28286973e-03,
       3.14252201e-03, 3.05116189e-03, 3.00821935e-03, 2.74749351e-03,
       2.67793486e-03, 2.63332878e-03, 2.51357162e-03, 2.43411028e-03,
       2.39974934e-03, 2.29246439e-03, 2.14026916e-03, 2.09463542e-03,
       2.07176829e-03, 1.67123315e-03, 1.57515410e-03, 8.73261202e-04,
       8.16787618e-04, 4.00894160e-04, 2.62314395e-04, 7.50628962e-05,
       2.69947183e-05, 3.68946684e-06, 9.03722548e-07, 1.16342153e-11,
       5.22446200e-12, 4.11473552e-12, 2.11395763e-12, 1.73379951e-12,
       1.54089522e-12, 9.75475645e-13, 8.98816972e-13, 7.05871034e-13,
       4.17916770e-13, 2.84710775e-13, 1.80504538e-13, 4.17814700e-34])
In [175]:
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

N components analysis: Based on skree graph we will take 40 n components.¶

40 components represent 90% of the varaiance of the data¶

In [177]:
from sklearn.decomposition import IncrementalPCA
pca = IncrementalPCA(n_components=40)
In [178]:
X_train_pca=pca.fit_transform(X_train)
In [179]:
X_train_pca.shape
Out[179]:
(20985, 40)
In [180]:
pca.components_
Out[180]:
array([[ 0.07629758,  0.2256092 ,  0.13664919, ...,  0.03949844,
        -0.03227046, -0.        ],
       [ 0.15517044,  0.09443407, -0.06122161, ..., -0.08173608,
        -0.04885412,  0.        ],
       [-0.05255816,  0.17220162,  0.21603692, ...,  0.27583064,
         0.00318245, -0.        ],
       ...,
       [-0.01347025, -0.01290335,  0.04479986, ...,  0.02463924,
         0.08937706,  0.        ],
       [ 0.08466575, -0.05716234,  0.01075553, ..., -0.24198979,
        -0.15457304,  0.        ],
       [-0.078044  , -0.01481756,  0.07862538, ...,  0.28532987,
        -0.19249444,  0.        ]])
In [181]:
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head(30)
Out[181]:
PC1 PC2 Feature
0 0.076298 0.155170 total_rech_num_8
1 0.225609 0.094434 total_rech_amt_8
2 0.136649 -0.061222 max_rech_amt_8
3 0.109520 -0.053851 last_day_rch_amt_8
4 0.014749 -0.062273 monthly_2g_8
5 -0.002801 0.011119 sachet_2g_8
6 0.056479 -0.070432 monthly_3g_8
7 0.006599 0.001022 sachet_3g_8
8 0.049391 -0.105521 aon_years
9 -0.013189 0.003058 sachet_2g_good_ph
10 0.000099 -0.005414 sachet_3g_good_ph
11 0.005376 -0.074215 monthly_2g_good_ph
12 0.042528 -0.077735 monthly_3g_good_ph
13 0.107139 -0.091909 max_rech_amt_good_ph
14 0.042244 0.142112 total_rech_num_good_ph
15 0.203216 0.088813 tot_rech_amt_good_ph
16 0.092682 -0.065364 last_day_rch_amt_good_ph
17 0.018665 0.083224 day_of_last_rech_6
18 0.045425 0.083261 day_of_last_rech_7
19 0.056687 0.089654 day_of_last_rech_8
20 0.227478 0.094871 arpu_8
21 0.114232 0.193582 onnet_mou_8
22 0.176185 0.145456 offnet_mou_8
23 0.014779 0.009159 roam_ic_mou_8
24 0.020704 0.034018 roam_og_mou_8
25 0.140618 -0.015793 loc_og_t2t_mou_8
26 0.200931 -0.078521 loc_og_t2m_mou_8
27 0.086519 -0.082582 loc_og_t2f_mou_8
28 0.019088 0.041479 loc_og_t2c_mou_8
29 0.223641 -0.064872 loc_og_mou_8
In [182]:
%matplotlib inline
fig = plt.figure(figsize = (8,8))
plt.scatter(pcs_df.PC1, pcs_df.PC2)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
for i, txt in enumerate(pcs_df.Feature):
    plt.annotate(txt, (pcs_df.PC1[i],pcs_df.PC2[i]))
plt.tight_layout()
plt.show()
In [183]:
pca.explained_variance_ratio_
Out[183]:
array([0.10552123, 0.09356073, 0.06069537, 0.0487609 , 0.04369076,
       0.03843441, 0.03098464, 0.02915601, 0.02809563, 0.0273662 ,
       0.02627115, 0.02524961, 0.02425371, 0.02290334, 0.02125699,
       0.02026422, 0.01883319, 0.01656114, 0.01469295, 0.01428665,
       0.01381256, 0.01341614, 0.01264494, 0.0122565 , 0.0119078 ,
       0.01067961, 0.0100218 , 0.00979932, 0.00938025, 0.00924034,
       0.00855022, 0.00827311, 0.00802292, 0.0074819 , 0.0073239 ,
       0.00710277, 0.00677336, 0.00647   , 0.00622158, 0.0057558 ])
In [184]:
%matplotlib inline
fig = plt.figure(figsize = (8,8))
plt.scatter(X_train_pca[:,0], X_train_pca[:,1], c = y_train.map({0:'green',1:'red'}))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.tight_layout()
plt.show()
In [185]:
X_train.head()
Out[185]:
total_rech_num_8 total_rech_amt_8 max_rech_amt_8 last_day_rch_amt_8 monthly_2g_8 sachet_2g_8 monthly_3g_8 sachet_3g_8 aon_years sachet_2g_good_ph ... total_og_mou_good_ph vbc_3g_good_ph std_ic_mou_good_ph loc_ic_mou_good_ph std_og_mou_good_ph loc_og_mou_good_ph vol_2g_mb_good_ph vol_3g_mb_good_ph action_required high_value_customer
0 -0.446222 -0.120083 -0.187419 -0.645018 -0.318315 -0.327589 -0.29811 -0.142157 0.382464 -0.322029 ... 1.898281 -0.349485 -0.050728 -0.563974 2.299885 -0.490779 -0.302375 0.328327 0.784442 0.0
1 -0.234334 0.410158 -0.050759 0.368976 2.470764 -0.327589 -0.29811 -0.142157 -0.354272 -0.322029 ... -0.650806 -0.349485 0.119928 -0.661131 -0.294499 -0.634671 0.963922 -0.352741 0.784442 0.0
2 1.142938 0.295717 -0.187419 -0.489539 -0.318315 -0.327589 -0.29811 -0.142157 0.014096 -0.322029 ... -0.997202 -0.145330 -0.267088 -0.367658 -0.701396 -0.565689 -0.305701 -0.353686 -1.274792 0.0
3 -0.446222 0.078281 -0.187419 0.233777 -0.318315 -0.327589 -0.29811 -0.142157 1.119200 -0.322029 ... -0.431579 -0.349485 -0.215869 -0.387189 -0.521795 0.114703 -0.305701 -0.353686 0.784442 0.0
4 0.083498 -0.007550 0.034653 -0.645018 -0.318315 -0.327589 -0.29811 -0.142157 -0.722640 -0.322029 ... 0.052038 -0.349485 -0.381728 -0.731184 0.505773 -0.699722 -0.305701 -0.353686 -1.274792 0.0

5 rows × 88 columns

In [186]:
corrmat = np.corrcoef(X_train_pca.transpose())
In [187]:
corrmat.shape
Out[187]:
(40, 40)
In [188]:
#plotting the correlation matrix
%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(corrmat[0:19,0:19],annot = True)
Out[188]:
<Axes: >
In [189]:
%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(corrmat[19:40,19:40],annot = True)
Out[189]:
<Axes: >
In [190]:
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 0.13631017903169826 , min corr:  -0.05370040292263015
In [191]:
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
Out[191]:
(8994, 40)

MODEL 4 using LogisticRegressionCV¶

In [192]:
#Training the model on the train data
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics

log_reg_cv = LogisticRegressionCV(class_weight='balanced',cv=20)
model_pca = log_reg_cv.fit(X_train_pca,y_train)
In [194]:
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
Out[194]:
'0.89'
In [195]:
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
Out[195]:
'0.89'
In [196]:
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix
In [197]:
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
In [198]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
In [199]:
confusion_matrix(y_test,y_pred_bool)
Out[199]:
array([[6856, 1361],
       [ 142,  635]], dtype=int64)
In [200]:
confusion_matrix(y_train,y_train_pred_bool)
Out[200]:
array([[15924,  3249],
       [  322,  1490]], dtype=int64)
In [201]:
precision_score(y_test,y_pred_bool)
Out[201]:
0.3181362725450902
In [202]:
recall_score(y_test,y_pred_bool)
Out[202]:
0.8172458172458172
In [203]:
accuracy_score(y_test,y_pred_bool)
Out[203]:
0.83288859239493

Metric Summary¶

In [204]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.83
Sensitivity : 0.82
Specificity : 0.83
Precision   : 0.32
Recall      : 0.82
F1_score    : 0.86

Final Model¶

In [205]:
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
In [206]:
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
Out[206]:
'0.89'
In [207]:
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
Out[207]:
'0.89'
In [208]:
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix
In [209]:
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
In [210]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
In [211]:
confusion_matrix(y_test,y_pred_bool)
Out[211]:
array([[6851, 1366],
       [ 138,  639]], dtype=int64)
In [212]:
confusion_matrix(y_train,y_train_pred_bool)
Out[212]:
array([[15883,  3290],
       [  311,  1501]], dtype=int64)
In [213]:
import  sklearn.metrics
In [214]:
precision_score(y_test,y_pred_bool)
Out[214]:
0.31870324189526184
In [215]:
recall_score(y_test,y_pred_bool)
Out[215]:
0.8223938223938224
In [216]:
accuracy_score(y_test,y_pred_bool)
Out[216]:
0.8327774071603291
In [217]:
def draw_roc( actual, probs ):
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                              drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return None
In [218]:
fpr, tpr, thresholds = metrics.roc_curve( y_train, y_train_pred, drop_intermediate = False )
In [219]:
draw_roc(y_train, y_train_pred)
In [220]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

confusion=confusion_matrix(y_train,y_train_pred_bool)

# TP = confusion[1,1] # true positive 
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives

def threshold_optimization(y,num):
    #pass the prediction_probability series and threshold probabilty cutoff
    y_bool= y.apply(lambda x: 1 if x>=num else 0)
    return y_bool

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train,threshold_optimization(y_train_pred,i))
    total1=sum(sum(cm1))
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)
     prob  accuracy     sensi     speci
0.0   0.0  0.086347  1.000000  0.000000
0.1   0.1  0.371265  0.979029  0.313827
0.2   0.2  0.542578  0.953642  0.503729
0.3   0.3  0.665142  0.915011  0.641527
0.4   0.4  0.761449  0.871965  0.751004
0.5   0.5  0.828401  0.828366  0.828405
0.6   0.6  0.871432  0.764901  0.881500
0.7   0.7  0.899309  0.682119  0.919835
0.8   0.8  0.912890  0.498896  0.952016
0.9   0.9  0.919228  0.286976  0.978981
In [221]:
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

Model Threshold optimization¶

Threshold Model Summary with cutoff as 0.3¶

In [222]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.3 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.3 else 0)
In [223]:
cm_test=confusion_matrix(y_test,y_pred_bool)
In [224]:
cm_train=confusion_matrix(y_train,y_train_pred_bool)
In [225]:
cm_train
Out[225]:
array([[12300,  6873],
       [  154,  1658]], dtype=int64)
In [226]:
cm_test
Out[226]:
array([[5306, 2911],
       [  63,  714]], dtype=int64)
In [227]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.67
Sensitivity : 0.92
Specificity : 0.65
Precision   : 0.2
Recall      : 0.92
F1_score    : 0.74

Threshold Model Summary with cutoff as 0.4¶

In [228]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.4 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.4 else 0)
In [229]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.77
Sensitivity : 0.87
Specificity : 0.76
Precision   : 0.25
Recall      : 0.87
F1_score    : 0.81

Threshold Model Summary with cutoff as 0.5¶

In [230]:
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
In [231]:
get_model_metrics(y_test,y_pred_bool)
Accuracy    : 0.83
Sensitivity : 0.82
Specificity : 0.83
Precision   : 0.32
Recall      : 0.82
F1_score    : 0.86
In [232]:
cross_validated_train_metrics=get_cross_validated_model_metrics(X_train_pca,y_train,cv=5)
In [233]:
cross_validated_train_metrics
Out[233]:
accuracy precision recall f1_weighted roc_auc
0 0.832261 0.317256 0.815427 0.862413 0.887782
1 0.822254 0.308308 0.848485 0.855716 0.892362
2 0.830593 0.314952 0.820442 0.861362 0.885959
3 0.820110 0.301715 0.825967 0.853889 0.887532
4 0.834882 0.318332 0.801105 0.864122 0.878673
In [234]:
cross_validated_train_metrics.mean()
Out[234]:
accuracy       0.828020
precision      0.312113
recall         0.822285
f1_weighted    0.859500
roc_auc        0.886461
dtype: float64
In [ ]:
cross_validated_test_metrics=get_cross_validated_model_metrics(X_test_pca,y_test,cv=5)
In [ ]:
cross_validated_test_metrics
In [ ]:
cross_validated_test_metrics.mean()

Hence, we have a very stable model which will predict the churn¶

In [ ]: